2024-12-07 02:44:45 +08:00
|
|
|
import base64
|
|
|
|
import httpx
|
2024-12-07 14:03:17 +08:00
|
|
|
from django.core.management.base import BaseCommand
|
2024-11-27 00:32:28 +08:00
|
|
|
|
|
|
|
from io import BytesIO
|
|
|
|
from documents.models import Document
|
|
|
|
from PIL import Image
|
|
|
|
import pytesseract
|
|
|
|
import fitz
|
|
|
|
import os
|
|
|
|
from config.settings import MEDIA_ROOT
|
|
|
|
from watchdog.observers import Observer
|
|
|
|
from watchdog.events import FileSystemEventHandler
|
|
|
|
from documents.models import Document
|
2024-12-07 02:44:45 +08:00
|
|
|
from config.settings import get_secret
|
2024-11-28 13:41:58 +08:00
|
|
|
from django.core.files import File
|
2024-11-27 00:32:28 +08:00
|
|
|
import logging
|
|
|
|
import time
|
2024-12-07 02:44:45 +08:00
|
|
|
from ollama import Client
|
2024-12-18 17:05:44 +08:00
|
|
|
from pydantic import BaseModel
|
2025-01-09 00:43:55 +08:00
|
|
|
from datetime import date, datetime
|
2024-12-18 17:05:44 +08:00
|
|
|
from typing import Optional
|
2025-01-09 00:43:55 +08:00
|
|
|
import calendar
|
2024-11-27 00:32:28 +08:00
|
|
|
|
|
|
|
|
|
|
|
class PDFHandler(FileSystemEventHandler):
    """Watchdog event handler that ingests newly created PDF files.

    For every ``.pdf`` file created in the watched tree the handler:

    1. renames the file so that its name contains no spaces,
    2. renders the first page, OCRs it with Tesseract and asks an Ollama
       vision model for a document type, sender and date,
    3. creates a ``Document`` row (unless one with the same name exists) and
       attaches the PDF to it,
    4. deletes the processed upload.
    """

    # Document types always offered to the LLM, in addition to the types
    # already stored on existing Document rows.
    DEFAULT_CATEGORIES = (
        "Documented Procedures Manual",
        "Form",
        "Special Order",
        "Memorandum",
    )

    # Dates whose year is earlier than this are treated as invalid.
    MIN_VALID_YEAR = 1980

    # Placeholders stored when no usable document date is available.
    NO_MONTH = "no_month"
    NO_YEAR = "no_year"

    def __init__(self):
        """Configure logging and announce that the watcher is starting."""
        super().__init__()
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
        self.logger = logging.getLogger(__name__)
        self.logger.info("Starting Document Watcher...")

    def on_created(self, event):
        """Process the created path when it is a file ending in ``.pdf``.

        Args:
            event: Watchdog file-system event; ``is_directory`` and
                ``src_path`` are read.

        Returns:
            None.
        """
        if event.is_directory:
            return None

        if event.src_path.endswith(".pdf"):
            self.logger.info(f"New PDF file detected: {event.src_path}")
            self.process_pdf(event.src_path)

    def process_pdf(self, file_path):
        """Extract metadata from a PDF and store it as a ``Document``.

        Any exception raised while processing is logged and swallowed so a
        single bad file does not stop the watcher; in that case the source
        file is left in place.

        Args:
            file_path: Path of the PDF file to ingest.
        """
        try:
            file_path, filename = self._normalize_filename(file_path)

            with fitz.open(file_path) as doc:
                num_pages = len(doc)
                image_bytes = self._render_first_page(doc)

            # Plain Tesseract OCR of the first page; stored as ocr_metadata
            # regardless of whether the LLM step below succeeds.
            text = pytesseract.image_to_string(
                Image.open(BytesIO(image_bytes))).strip()

            # Try the Ollama vision model first; if anything in that step
            # fails, fall back to default values for every LLM-derived field.
            try:
                (document_type, sent_from,
                 document_month, document_year) = self._describe_with_llm(
                    image_bytes)
            except Exception as e:
                document_type = "other"
                sent_from = "N/A"
                document_month = self.NO_MONTH
                document_year = self.NO_YEAR

                self.logger.warning(f"Error! {e}")
                self.logger.warning(
                    "Ollama OCR offload failed. Using defaults for missing values")

            metadata = text

            document_name = filename.replace(".pdf", "")
            document = Document.objects.filter(name=document_name).first()
            if not document:
                document = Document.objects.create(
                    name=document_name,
                    number_pages=num_pages,
                    ocr_metadata=metadata,
                    document_type=document_type,
                    sent_from=sent_from,
                    document_month=document_month,
                    document_year=document_year,
                )

                # Close the handle deterministically: the file is deleted
                # right after this, and an open handle would leak (and would
                # block the delete on platforms that lock open files).
                with open(file_path, "rb") as pdf_file:
                    document.file.save(name=filename, content=File(pdf_file))

                self.logger.info(
                    f"Document '{filename}' created successfully with type '{document_type}'. "
                    f"sent_from: {sent_from}, document_month: {document_month}, "
                    f"document_year: {document_year}"
                )
            else:
                self.logger.info(f"Document '{filename}' already exists.")

            os.remove(file_path)
        except Exception as e:
            self.logger.error(f"Error processing PDF: {str(e)}")

    @staticmethod
    def _normalize_filename(file_path):
        """Rename the file on disk so that its name contains no spaces.

        Args:
            file_path: Path of the file as reported by the watcher.

        Returns:
            A ``(file_path, filename)`` tuple describing the file after the
            optional rename; unchanged when the name had no spaces.
        """
        directory, original_filename = os.path.split(file_path)
        if " " not in original_filename:
            return file_path, original_filename

        new_filename = original_filename.replace(" ", "_")
        new_file_path = os.path.join(directory, new_filename)
        os.rename(file_path, new_file_path)
        return new_file_path, new_filename

    @staticmethod
    def _render_first_page(doc):
        """Render the first page of an open PyMuPDF document to image bytes.

        Only the first page is rendered; it is the only page that is OCRed.

        Args:
            doc: Document object returned by ``fitz.open``.

        Returns:
            The encoded image as ``bytes`` (``Pixmap.tobytes()`` default
            output format).
        """
        # get_pixmap expects a Matrix (or a six-number sequence); a bare
        # 2-tuple is not a valid matrix, so build the 1.2x zoom explicitly.
        zoom = fitz.Matrix(1.2, 1.2)
        return doc[0].get_pixmap(matrix=zoom).tobytes()

    def _describe_with_llm(self, image_bytes):
        """Ask the Ollama model to classify the page and extract details.

        Args:
            image_bytes: Encoded image of the first page.

        Returns:
            A ``(document_type, sent_from, document_month, document_year)``
            tuple.

        Raises:
            Exception: whatever the HTTP client, Ollama client or pydantic
                validation raises; the caller treats any failure as "use
                defaults".
        """
        use_auth = get_secret("OLLAMA_USE_AUTH")
        client = Client(
            host=get_secret("OLLAMA_URL"),
            auth=httpx.BasicAuth(
                username=get_secret("OLLAMA_USERNAME"),
                password=get_secret("OLLAMA_PASSWORD"),
            ) if use_auth else None,
        )
        encoded_image = base64.b64encode(image_bytes).decode()

        document_type = self._ask_category(client, encoded_image)
        sent_from, document_date = self._ask_details(client, encoded_image)
        document_month, document_year = self._date_parts(document_date)
        return document_type, sent_from, document_month, document_year

    def _known_categories(self):
        """Return the sorted document types to suggest to the LLM.

        Combines the ``document_type`` values of existing ``Document`` rows
        with ``DEFAULT_CATEGORIES``; empty values are skipped.
        """
        stored = Document.objects.all().values_list("document_type", flat=True)
        categories = {str(value) for value in stored if value}
        categories.update(self.DEFAULT_CATEGORIES)
        # Sorted so the prompt is identical from run to run.
        return sorted(categories)

    def _ask_category(self, client, encoded_image):
        """First LLM call: determine the document category.

        Args:
            client: Ollama ``Client`` instance.
            encoded_image: Base64 text of the page image.

        Returns:
            The ``category`` field of the validated model response.
        """
        class DocumentSchema(BaseModel):
            category: str = "other"
            explanation: Optional[str] = None

        possible_categories = ", ".join(self._known_categories())
        prompt = f"""
        Read the text from the image and provide a document_type.

        Possible document types are: {possible_categories}. You are free to create a new one if none are suitable.

        If the document_type is Special Order or Memorandum, provide the sender of the document under sent_from.

        Do all of this and return your output in JSON.
        """

        content = self._chat(client, prompt, encoded_image,
                             DocumentSchema.model_json_schema())
        return DocumentSchema.model_validate_json(content).category

    def _ask_details(self, client, encoded_image):
        """Second LLM call: determine the sender and the document date.

        Args:
            client: Ollama ``Client`` instance.
            encoded_image: Base64 text of the page image.

        Returns:
            A ``(sent_from, document_date)`` tuple; ``document_date`` is a
            ``datetime.date`` or ``None``.
        """
        class DocumentSchema(BaseModel):
            sent_from: str = "N/A"
            subject: str = "N/A"
            # No default: the field must be present in the response (it may
            # be null).
            document_date: Optional[date]
            explanation: Optional[str] = None

        prompt = """
        Determine who sent the document. Otherwise, return N/A.

        Identify the subject or possible title of the document.

        Return the date of the document if it exists.

        Do all of this and return your output in JSON.
        """

        content = self._chat(client, prompt, encoded_image,
                             DocumentSchema.model_json_schema())
        result = DocumentSchema.model_validate_json(content)
        return result.sent_from, result.document_date

    @staticmethod
    def _chat(client, prompt, encoded_image, schema):
        """Send one image prompt to the configured model.

        Args:
            client: Ollama ``Client`` instance.
            prompt: Instruction text for the model.
            encoded_image: Base64 text of the page image.
            schema: JSON schema passed as the response ``format``.

        Returns:
            The message content of the model's response.
        """
        response = client.chat(
            model=get_secret("OLLAMA_MODEL"),
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                    "images": [encoded_image],
                },
            ],
            format=schema,
            options={"temperature": 0},
        )
        return response.message.content

    @classmethod
    def _date_parts(cls, document_date):
        """Split a document date into the stored month and year values.

        Args:
            document_date: ``datetime.date`` or ``None``.

        Returns:
            ``(month_name, year)`` for a usable date, otherwise the
            ``("no_month", "no_year")`` placeholders. A date is unusable when
            it is missing or its year is before ``MIN_VALID_YEAR``.
        """
        if not document_date or document_date.year < cls.MIN_VALID_YEAR:
            return cls.NO_MONTH, cls.NO_YEAR
        return document_date.strftime("%B"), document_date.year
|
|
|
|
|
|
|
|
|
|
|
|
class PDFWatcher:
    """Runs a watchdog observer over the ``uploads`` folder of MEDIA_ROOT."""

    def __init__(self):
        """Create the (not yet started) observer."""
        self.observer = Observer()

    def run(self):
        """Watch the uploads directory until interrupted.

        Schedules a ``PDFHandler`` recursively on ``MEDIA_ROOT/uploads``,
        starts the observer and then blocks. On Ctrl+C the observer is
        stopped and joined and the method returns; on any other exception the
        observer is still stopped and joined before the exception propagates.
        """
        event_handler = PDFHandler()
        watch_directory = os.path.join(MEDIA_ROOT, "uploads")

        # Make sure the directory exists so scheduling the watch does not
        # fail on a fresh installation.
        os.makedirs(watch_directory, exist_ok=True)

        self.observer.schedule(event_handler, watch_directory, recursive=True)
        self.observer.start()

        try:
            while True:
                time.sleep(5)
        except KeyboardInterrupt:
            # Ctrl+C is the normal way to end the service; shut down quietly.
            pass
        finally:
            self.observer.stop()
            self.observer.join()
|
|
|
|
|
|
|
|
|
|
|
|
class Command(BaseCommand):
    """Django management command that starts the PDF watcher service."""

    help = "Runs a dedicated file watcher service"

    def handle(self, *args, **options):
        """Build a ``PDFWatcher`` and block in its ``run`` loop."""
        PDFWatcher().run()
|