Add additional scanning metadata and sorting for documents

2025-08-29 20:13:38 +08:00 · 2025-01-09 00:43:55 +08:00 · 2025-01-09 00:43:55 +08:00 · 41507aa550
commit 41507aa550
parent 674a7ec592
5 changed files with 126 additions and 45 deletions
--- a/docmanager_backend/config/management/commands/start_watcher.py
+++ b/docmanager_backend/config/management/commands/start_watcher.py
@ -18,8 +18,9 @@ import logging
 import time
 from ollama import Client
 from pydantic import BaseModel
+from datetime import date, datetime
 from typing import Optional
-import json
+import calendar


 class PDFHandler(FileSystemEventHandler):
@ -87,11 +88,6 @@ class PDFHandler(FileSystemEventHandler):

                # Try to pass image to the Ollama image recognition API first
                try:
-                    class DocumentCategory(BaseModel):
-                        category: str = "other"
-                        sent_from: str = "N/A"
-                        explanation: Optional[str] = None
-
                    client = Client(
                        host=get_secret("OLLAMA_URL"),
                        auth=httpx.BasicAuth(
@ -101,15 +97,54 @@ class PDFHandler(FileSystemEventHandler):
                    encoded_image = base64.b64encode(
                        img_buffer.getvalue()).decode()

+                    # First LLM API call to determine category
+                    # Structured-output schema for the first call: its JSON schema is
+                    # passed to client.chat(format=...) and the reply is validated
+                    # against it.
+                    class DocumentSchema(BaseModel):
+                        # Document type picked by the model; "other" when omitted
+                        category: str = "other"
+                        # Optional free-text field; not read anywhere in this hunk
+                        explanation: Optional[str] = None
+
-                    possible_categories = set((Document.objects.all().values_list(
-                        "document_type", flat=True), "Documented Procedures Manual", "Form", "Special Order", "Memorandum"))
+                    # Merge the types already stored with the built-in ones. The old
+                    # set((queryset, "a", ...)) kept the queryset as a single element,
+                    # so the prompt showed its (truncated) repr instead of the names.
+                    # order_by() clears any default ordering so distinct() can dedupe;
+                    # sorted() keeps the prompt text stable between runs.
+                    stored_types = Document.objects.order_by().values_list(
+                        "document_type", flat=True).distinct()
+                    possible_categories = sorted(
+                        {doc_type for doc_type in stored_types if doc_type}
+                        | {"Documented Procedures Manual", "Form", "Special Order", "Memorandum"})
                    prompt = f"""
-                        Read the text from the image and provide a category. Return as JSON.
+                        Read the text from the image and provide a document_type.

-                        Possible categories are: {possible_categories}. You are free to create a new one if none are suitable.
+                        Possible document types are: {possible_categories}. You are free to create a new one if none are suitable.

-                        If the document is of type Special Order or Memorandum, provide the sender of the document. Possible senders are Vice President, President, Chancellor.
-                        provide N/A.
+                        If the document_type is Special Order or Memorandum, provide the sender of the document under sent_from.
+
+                        Do all of this and return your output in JSON.
+                        """
+
+                    response = client.chat(
+                        model=get_secret("OLLAMA_MODEL"),
+                        messages=[
+                            {"role": "user",
+                                "content": prompt,
+                                "images": [encoded_image]},
+                        ],
+                        format=DocumentSchema.model_json_schema(),
+                        options={
+                            "temperature": 0
+                        },
+                    )
+                    result = DocumentSchema.model_validate_json(
+                        response.message.content)
+                    document_type = result.category
+
+                    # Second LLM API call to determine other details
+                    # Schema for the second call. Every field has a default so that a
+                    # reply which leaves one out still validates, instead of raising
+                    # and falling into the except branch that resets document_type.
+                    class DocumentSchema(BaseModel):
+                        sent_from: str = "N/A"
+                        subject: str = "N/A"
+                        # Optional[...] without a default is a *required* field in
+                        # pydantic v2, so give it an explicit None default.
+                        document_date: Optional[date] = None
+                        explanation: Optional[str] = None
+
+                    prompt = f"""
+                        Determine who sent the document. Otherwise, return N/A.
+
+                        Identify the subject or possible title of the document.
+
+                        Return the date of the document if it exists.
+
+                        Do all of this and return your output in JSON.
                        """
                    response = client.chat(
                        model=get_secret("OLLAMA_MODEL"),
@ -118,55 +153,62 @@ class PDFHandler(FileSystemEventHandler):
                                "content": prompt,
                                "images": [encoded_image]},
                        ],
-                        format=DocumentCategory.model_json_schema(),
+                        format=DocumentSchema.model_json_schema(),
                        options={
                            "temperature": 0
                        },
-
                    )
-
-                    DocumentCategory.model_validate_json(
+                    result = DocumentSchema.model_validate_json(
                        response.message.content)
-                    result = json.loads(response.message.content)
-                    document_type = result.get("category")
-                    sent_from = result.get("sent_from")
+
+                    sent_from = result.sent_from
+                    # The subject is extracted by the schema but was never stored;
+                    # keep it, trimmed to the 128 characters the model field allows.
+                    subject = result.subject[:128]
+                    document_date = result.document_date
+
+                    if document_date:
+                        document_month = document_date.strftime("%B")
+                        document_year = result.document_date.year
+                        # Years before 1980 are treated as misreads: fall back to
+                        # the same placeholders used when no date was found
+                        if document_year < 1980:
+                            document_month = "no_month"
+                            document_year = "no_year"
+                    else:
+                        document_month = "no_month"
+                        document_year = "no_year"

                # If that fails, just use regular OCR read the title as a dirty fix/fallback
                except Exception as e:
+                    document_type = "other"
+                    sent_from = "N/A"
+                    subject = "N/A"
+                    document_month = "no_month"
+                    document_year = "no_year"
+
                    self.logger.warning(f"Error! {e}")
                    self.logger.warning(
-                        "Ollama OCR offload failed. Falling back to default OCR")
-                    lines = text.split("\n")
-
-                    for line in lines:
-                        if line.strip():
-                            document_type = line.strip().lower()
-                            break
-
-                    if not document_type:
-                        document_type = "other"
+                        "Ollama OCR offload failed. Using defaults for missing values")

                metadata += text

            # Open the file for instance creation
-            DOCUMENT, created = Document.objects.get_or_create(
-                name=filename.replace(".pdf", ""),
-                defaults={
-                    "number_pages": num_pages,
-                    "ocr_metadata": metadata,
-                    "document_type": document_type,
-                },
-            )
+            DOCUMENT = Document.objects.filter(
+                name=filename.replace(".pdf", "")).first()
+            if not DOCUMENT:
+                DOCUMENT = Document.objects.create(
+                    name=filename.replace(".pdf", ""),
+                    number_pages=num_pages,
+                    ocr_metadata=metadata,
+                    document_type=document_type,
+                    sent_from=sent_from,
+                    subject=subject,
+                    document_month=document_month,
+                    document_year=document_year
+                )

-            if created:
                DOCUMENT.file.save(
                    name=filename, content=File(open(file_path, "rb")))
+
                self.logger.info(
                    f"Document '{filename}' created successfully with type '{
-                        document_type}'. sent_from: {sent_from}"
+                        document_type}'. sent_from: {sent_from}, document_month: {document_month}, document_year: {document_year}"
                )
-                DOCUMENT.sent_from = sent_from
-                DOCUMENT.save()

            else:
                self.logger.info(f"Document '{filename}' already exists.")
--- a/docmanager_backend/document_requests/serializers.py
+++ b/docmanager_backend/document_requests/serializers.py
@ -200,9 +200,9 @@ class DocumentRequestUpdateSerializer(serializers.ModelSerializer):
        # Send an email on request status update
        try:
            email = RequestUpdateEmail()
-            email.context = {"request_status": instance.status}
-            if instance.status == "denied":
-                email.context = {"remarks": instance.remarks}
-            else:
-                email.context = {"remarks": "N/A"}
+            # Build the context in one assignment. Re-assigning email.context in
+            # the if/else replaced the whole dict and dropped "request_status".
+            new_status = validated_data["status"]
+            email.context = {
+                "request_status": new_status,
+                # Remarks are only included when the request was denied
+                "remarks": validated_data["remarks"] if new_status == "denied" else "N/A",
+            }
            email.send(to=[instance.requester.email])
--- a/docmanager_backend/documents/admin.py
+++ b/docmanager_backend/documents/admin.py
@ -7,5 +7,7 @@ from .models import Document
@admin.register(Document)
 class DocumentAdmin(ModelAdmin):
    model = Document
-    search_fields = ["id", "name", "document_type"]
-    list_display = ["id", "name", "document_type", "date_uploaded"]
+    # Fields matched by the admin search box
+    search_fields = ["id", "name", "subject", "sent_from", "document_year",
+                     "document_month", "document_type"]
+    # Columns shown in the admin change list
+    list_display = ["id", "name", "subject", "sent_from", "document_year",
+                    "document_month", "document_type", "date_uploaded"]
--- a/docmanager_backend/documents/migrations/0005_document_document_month_document_document_year_and_more.py
+++ b/docmanager_backend/documents/migrations/0005_document_document_month_document_document_year_and_more.py
@ -0,0 +1,28 @@
+# Generated by Django 5.1.3 on 2025-01-08 14:41
+
+from django.db import migrations, models
+
+
+# Schema migration for Document: adds the nullable, optional CharFields
+# document_month, document_year and subject (max_length=128 each).
+class Migration(migrations.Migration):
+
+    # Runs after migration 0004 of the documents app
+    dependencies = [
+        ("documents", "0004_rename_memorandum_from_document_sent_from"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="document",
+            name="document_month",
+            field=models.CharField(blank=True, max_length=128, null=True),
+        ),
+        migrations.AddField(
+            model_name="document",
+            name="document_year",
+            field=models.CharField(blank=True, max_length=128, null=True),
+        ),
+        migrations.AddField(
+            model_name="document",
+            name="subject",
+            field=models.CharField(blank=True, max_length=128, null=True),
+        ),
+    ]
--- a/docmanager_backend/documents/models.py
+++ b/docmanager_backend/documents/models.py
@ -12,12 +12,21 @@ class Document(models.Model):
    sent_from = models.CharField(
        max_length=128, null=True, blank=True
    )
+    document_month = models.CharField(
+        max_length=128, null=True, blank=True
+    )
+    document_year = models.CharField(
+        max_length=128, null=True, blank=True
+    )
+    subject = models.CharField(
+        max_length=128, null=True, blank=True
+    )
    number_pages = models.IntegerField(null=False, blank=False)
    ocr_metadata = models.TextField(null=True, blank=True)

    def upload_to(instance, filename):
        _, extension = filename.rsplit(".", 1)
-        return "documents/%s_%s.%s" % (now(), str(uuid.uuid4()), extension)
+        # Files are stored as documents/<type>/<year>/<uuid>.<extension>.
+        # document_type may be free text (the watcher prompt lets the LLM
+        # invent new types), so neutralise path separators and surrounding
+        # dots before using it as a directory name.
+        type_dir = str(instance.document_type).replace(
+            "/", "_").replace("\\", "_").strip(". ") or "other"
+        return f"documents/{type_dir}/{instance.document_year}/{uuid.uuid4()}.{extension}"

    file = models.FileField(upload_to=upload_to)