diff --git a/docmanager_backend/config/management/commands/start_watcher.py b/docmanager_backend/config/management/commands/start_watcher.py index 401f5e3..bd13376 100644 --- a/docmanager_backend/config/management/commands/start_watcher.py +++ b/docmanager_backend/config/management/commands/start_watcher.py @@ -18,8 +18,9 @@ import logging import time from ollama import Client from pydantic import BaseModel +from datetime import date, datetime from typing import Optional -import json +import calendar class PDFHandler(FileSystemEventHandler): @@ -87,11 +88,6 @@ class PDFHandler(FileSystemEventHandler): # Try to pass image to the Ollama image recognition API first try: - class DocumentCategory(BaseModel): - category: str = "other" - sent_from: str = "N/A" - explanation: Optional[str] = None - client = Client( host=get_secret("OLLAMA_URL"), auth=httpx.BasicAuth( @@ -101,15 +97,54 @@ class PDFHandler(FileSystemEventHandler): encoded_image = base64.b64encode( img_buffer.getvalue()).decode() + # First LLM API call to determine category + class DocumentSchema(BaseModel): + category: str = "other" + explanation: Optional[str] = None + possible_categories = set((Document.objects.all().values_list( "document_type", flat=True), "Documented Procedures Manual", "Form", "Special Order", "Memorandum")) prompt = f""" - Read the text from the image and provide a category. Return as JSON. + Read the text from the image and provide a document_type. - Possible categories are: {possible_categories}. You are free to create a new one if none are suitable. + Possible document types are: {possible_categories}. You are free to create a new one if none are suitable. - If the document is of type Special Order or Memorandum, provide the sender of the document. Possible senders are Vice President, President, Chancellor. - provide N/A. + If the document_type is Special Order or Memorandum, provide the sender of the document under sent_from. + + Do all of this and return your output in JSON. + """ + + response = client.chat( + model=get_secret("OLLAMA_MODEL"), + messages=[ + {"role": "user", + "content": prompt, + "images": [encoded_image]}, + ], + format=DocumentSchema.model_json_schema(), + options={ + "temperature": 0 + }, + ) + result = DocumentSchema.model_validate_json( + response.message.content) + document_type = result.category + + # Second LLM API call to determine other details + class DocumentSchema(BaseModel): + sent_from: str = "N/A" + subject: str = "N/A" + document_date: Optional[date] + explanation: Optional[str] = None + + prompt = f""" + Determine who sent the document. Otherwise, return N/A. + + Identify the subject or possible title of the document. + + Return the date of the document if it exists. + + Do all of this and return your output in JSON. """ response = client.chat( model=get_secret("OLLAMA_MODEL"), @@ -118,55 +153,62 @@ class PDFHandler(FileSystemEventHandler): "content": prompt, "images": [encoded_image]}, ], - format=DocumentCategory.model_json_schema(), + format=DocumentSchema.model_json_schema(), options={ "temperature": 0 }, - ) - - DocumentCategory.model_validate_json( + result = DocumentSchema.model_validate_json( response.message.content) - result = json.loads(response.message.content) - document_type = result.get("category") - sent_from = result.get("sent_from") + + sent_from = result.sent_from + document_date = result.document_date + + if document_date: + document_month = document_date.strftime("%B") + document_year = result.document_date.year + # Set as none for invalid dates + if document_year < 1980: + document_month = "no_month" + document_year = "no_year" + else: + document_month = "no_month" + document_year = "no_year" # If that fails, just use regular OCR read the title as a dirty fix/fallback except Exception as e: + document_type = "other" + sent_from = "N/A" + document_month = "no_month" + document_year = "no_year" + self.logger.warning(f"Error! {e}") self.logger.warning( - "Ollama OCR offload failed. Falling back to default OCR") - lines = text.split("\n") - - for line in lines: - if line.strip(): - document_type = line.strip().lower() - break - - if not document_type: - document_type = "other" + "Ollama OCR offload failed. Using defaults for missing values") metadata += text # Open the file for instance creation - DOCUMENT, created = Document.objects.get_or_create( - name=filename.replace(".pdf", ""), - defaults={ - "number_pages": num_pages, - "ocr_metadata": metadata, - "document_type": document_type, - }, - ) + DOCUMENT = Document.objects.filter( + name=filename.replace(".pdf", "")).first() + if not DOCUMENT: + DOCUMENT = Document.objects.create( + name=filename.replace(".pdf", ""), + number_pages=num_pages, + ocr_metadata=metadata, + document_type=document_type, + sent_from=sent_from, + document_month=document_month, + document_year=document_year + ) - if created: DOCUMENT.file.save( name=filename, content=File(open(file_path, "rb"))) + self.logger.info( f"Document '{filename}' created successfully with type '{ - document_type}'. sent_from: {sent_from}" + document_type}'. sent_from: {sent_from}, document_month: {document_month}, document_year: {document_year}" ) - DOCUMENT.sent_from = sent_from - DOCUMENT.save() else: self.logger.info(f"Document '{filename}' already exists.") diff --git a/docmanager_backend/document_requests/serializers.py b/docmanager_backend/document_requests/serializers.py index 7ab83df..332a1c4 100644 --- a/docmanager_backend/document_requests/serializers.py +++ b/docmanager_backend/document_requests/serializers.py @@ -200,9 +200,9 @@ class DocumentRequestUpdateSerializer(serializers.ModelSerializer): # Send an email on request status update try: email = RequestUpdateEmail() - email.context = {"request_status": instance.status} - if instance.status == "denied": - email.context = {"remarks": instance.remarks} + email.context = {"request_status": validated_data["status"]} + if validated_data["status"] == "denied": + email.context = {"remarks": validated_data["remarks"]} else: email.context = {"remarks": "N/A"} email.send(to=[instance.requester.email]) diff --git a/docmanager_backend/documents/admin.py b/docmanager_backend/documents/admin.py index 356fe3e..e569489 100644 --- a/docmanager_backend/documents/admin.py +++ b/docmanager_backend/documents/admin.py @@ -7,5 +7,7 @@ from .models import Document @admin.register(Document) class DocumentAdmin(ModelAdmin): model = Document - search_fields = ["id", "name", "document_type"] - list_display = ["id", "name", "document_type", "date_uploaded"] + search_fields = ["id", "name", "subject", "sent_from", "document_year", + "document_month", "document_type"] + list_display = ["id", "name", "subject", "sent_from", "document_year", + "document_month", "document_type", "date_uploaded"] diff --git a/docmanager_backend/documents/migrations/0005_document_document_month_document_document_year_and_more.py b/docmanager_backend/documents/migrations/0005_document_document_month_document_document_year_and_more.py new file mode 100644 index 0000000..78ab6f4 --- /dev/null +++ b/docmanager_backend/documents/migrations/0005_document_document_month_document_document_year_and_more.py @@ -0,0 +1,28 @@ +# Generated by Django 5.1.3 on 2025-01-08 14:41 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("documents", "0004_rename_memorandum_from_document_sent_from"), + ] + + operations = [ + migrations.AddField( + model_name="document", + name="document_month", + field=models.CharField(blank=True, max_length=128, null=True), + ), + migrations.AddField( + model_name="document", + name="document_year", + field=models.CharField(blank=True, max_length=128, null=True), + ), + migrations.AddField( + model_name="document", + name="subject", + field=models.CharField(blank=True, max_length=128, null=True), + ), + ] diff --git a/docmanager_backend/documents/models.py b/docmanager_backend/documents/models.py index 59464cf..a33a839 100644 --- a/docmanager_backend/documents/models.py +++ b/docmanager_backend/documents/models.py @@ -12,12 +12,21 @@ class Document(models.Model): sent_from = models.CharField( max_length=128, null=True, blank=True ) + document_month = models.CharField( + max_length=128, null=True, blank=True + ) + document_year = models.CharField( + max_length=128, null=True, blank=True + ) + subject = models.CharField( + max_length=128, null=True, blank=True + ) number_pages = models.IntegerField(null=False, blank=False) ocr_metadata = models.TextField(null=True, blank=True) def upload_to(instance, filename): _, extension = filename.rsplit(".", 1) - return "documents/%s_%s.%s" % (now(), str(uuid.uuid4()), extension) + return f"documents/{instance.document_type}/{instance.document_year}/{str(uuid.uuid4())}.{extension}" file = models.FileField(upload_to=upload_to)