Add additional scanning metadata and sorting for documents

2025-09-18 13:39:48 +08:00 · 2025-01-09 00:43:55 +08:00 · 2025-01-09 00:43:55 +08:00 · 41507aa550
commit 41507aa550
parent 674a7ec592
5 changed files with 126 additions and 45 deletions
--- a/docmanager_backend/config/management/commands/start_watcher.py
+++ b/docmanager_backend/config/management/commands/start_watcher.py
@ -18,8 +18,9 @@ import logging
 import time
 from ollama import Client
 from pydantic import BaseModel
 from datetime import date, datetime
 from typing import Optional
-import json
+import calendar
 class PDFHandler(FileSystemEventHandler):
@ -87,11 +88,6 @@ class PDFHandler(FileSystemEventHandler):
                # Try to pass image to the Ollama image recognition API first
                try:
                    class DocumentCategory(BaseModel):
                        category: str = "other"
                        sent_from: str = "N/A"
                        explanation: Optional[str] = None
                    client = Client(
                        host=get_secret("OLLAMA_URL"),
                        auth=httpx.BasicAuth(
@ -101,15 +97,54 @@ class PDFHandler(FileSystemEventHandler):
                    encoded_image = base64.b64encode(
                        img_buffer.getvalue()).decode()
                    # First LLM API call to determine category
                    class DocumentSchema(BaseModel):
                        category: str = "other"
                        explanation: Optional[str] = None
                    possible_categories = set((Document.objects.all().values_list(
                        "document_type", flat=True), "Documented Procedures Manual", "Form", "Special Order", "Memorandum"))
                    prompt = f"""
-                        Read the text from the image and provide a category. Return as JSON.
+                        Read the text from the image and provide a document_type.
-                        Possible categories are: {possible_categories}. You are free to create a new one if none are suitable.
+                        Possible document types are: {possible_categories}. You are free to create a new one if none are suitable.
-                        If the document is of type Special Order or Memorandum, provide the sender of the document. Possible senders are Vice President, President, Chancellor.
+                        If the document_type is Special Order or Memorandum, provide the sender of the document under sent_from.
-                        provide N/A.
+
                        Do all of this and return your output in JSON.
                        """
                    response = client.chat(
                        model=get_secret("OLLAMA_MODEL"),
                        messages=[
                            {"role": "user",
                                "content": prompt,
                                "images": [encoded_image]},
                        ],
                        format=DocumentSchema.model_json_schema(),
                        options={
                            "temperature": 0
                        },
                    )
                    result = DocumentSchema.model_validate_json(
                        response.message.content)
                    document_type = result.category
                    # Second LLM API call to determine other details
                    class DocumentSchema(BaseModel):
                        sent_from: str = "N/A"
                        subject: str = "N/A"
                        document_date: Optional[date]
                        explanation: Optional[str] = None
                    prompt = f"""
                        Determine who sent the document. Otherwise, return N/A.
                        Identify the subject or possible title of the document.
                        Return the date of the document if it exists.
                        Do all of this and return your output in JSON.
                        """
                    response = client.chat(
                        model=get_secret("OLLAMA_MODEL"),
@ -118,55 +153,62 @@ class PDFHandler(FileSystemEventHandler):
                                "content": prompt,
                                "images": [encoded_image]},
                        ],
-                        format=DocumentCategory.model_json_schema(),
+                        format=DocumentSchema.model_json_schema(),
                        options={
                            "temperature": 0
                        },
                    )
-
+                    result = DocumentSchema.model_validate_json(
                    DocumentCategory.model_validate_json(
                        response.message.content)
-                    result = json.loads(response.message.content)
+
-                    document_type = result.get("category")
+                    sent_from = result.sent_from
-                    sent_from = result.get("sent_from")
+                    document_date = result.document_date
                    if document_date:
                        document_month = document_date.strftime("%B")
                        document_year = result.document_date.year
                        # Treat years before 1980 as invalid and use the placeholder values instead
                        if document_year < 1980:
                            document_month = "no_month"
                            document_year = "no_year"
                    else:
                        document_month = "no_month"
                        document_year = "no_year"
                # If that fails, fall back to the regular OCR text and use its first non-empty line as the document type (a dirty fix/fallback)
                except Exception as e:
                    document_type = "other"
                    sent_from = "N/A"
                    document_month = "no_month"
                    document_year = "no_year"
                    self.logger.warning(f"Error! {e}")
                    self.logger.warning(
-                        "Ollama OCR offload failed. Falling back to default OCR")
+                        "Ollama OCR offload failed. Using defaults for missing values")
                    lines = text.split("\n")
                    for line in lines:
                        if line.strip():
                            document_type = line.strip().lower()
                            break
                    if not document_type:
                        document_type = "other"
                metadata += text
            # Open the file for instance creation
-            DOCUMENT, created = Document.objects.get_or_create(
+            DOCUMENT = Document.objects.filter(
-                name=filename.replace(".pdf", ""),
+                name=filename.replace(".pdf", "")).first()
-                defaults={
+            if not DOCUMENT:
-                    "number_pages": num_pages,
+                DOCUMENT = Document.objects.create(
-                    "ocr_metadata": metadata,
+                    name=filename.replace(".pdf", ""),
-                    "document_type": document_type,
+                    number_pages=num_pages,
-                },
+                    ocr_metadata=metadata,
-            )
+                    document_type=document_type,
                    sent_from=sent_from,
                    document_month=document_month,
                    document_year=document_year
                )
            if created:
                DOCUMENT.file.save(
                    name=filename, content=File(open(file_path, "rb")))
                self.logger.info(
                    f"Document '{filename}' created successfully with type '{
-                        document_type}'. sent_from: {sent_from}"
+                        document_type}'. sent_from: {sent_from}, document_month: {document_month}, document_year: {document_year}"
                )
                DOCUMENT.sent_from = sent_from
                DOCUMENT.save()
            else:
                self.logger.info(f"Document '{filename}' already exists.")
--- a/docmanager_backend/document_requests/serializers.py
+++ b/docmanager_backend/document_requests/serializers.py
@ -200,9 +200,9 @@ class DocumentRequestUpdateSerializer(serializers.ModelSerializer):
        # Send an email on request status update
        try:
            email = RequestUpdateEmail()
-            email.context = {"request_status": instance.status}
+            email.context = {"request_status": validated_data["status"]}
-            if instance.status == "denied":
+            if validated_data["status"] == "denied":
-                email.context = {"remarks": instance.remarks}
+                email.context = {"remarks": validated_data["remarks"]}
            else:
                email.context = {"remarks": "N/A"}
            email.send(to=[instance.requester.email])
--- a/docmanager_backend/documents/admin.py
+++ b/docmanager_backend/documents/admin.py
@ -7,5 +7,7 @@ from .models import Document
@admin.register(Document)
 class DocumentAdmin(ModelAdmin):
    model = Document
-    search_fields = ["id", "name", "document_type"]
+    search_fields = ["id", "name", "subject", "sent_from", "document_year",
-    list_display = ["id", "name", "document_type", "date_uploaded"]
+                     "document_month", "document_type"]
    list_display = ["id", "name", "subject", "sent_from", "document_year",
                    "document_month", "document_type", "date_uploaded"]
--- a/docmanager_backend/documents/migrations/0005_document_document_month_document_document_year_and_more.py
+++ b/docmanager_backend/documents/migrations/0005_document_document_month_document_document_year_and_more.py
@ -0,0 +1,28 @@
 # Generated by Django 5.1.3 on 2025-01-08 14:41
 from django.db import migrations, models
 # Schema migration for the "documents" app: adds the document_month,
 # document_year and subject columns to the "document" model.
 class Migration(migrations.Migration):
    # Applied after migration 0004 of the same app.
    dependencies = [
        ("documents", "0004_rename_memorandum_from_document_sent_from"),
    ]
    # Every new column is a CharField of at most 128 characters. null=True and
    # blank=True make each one optional, so rows that already exist do not
    # need a default value when the column is added.
    operations = [
        # Month of the document, stored as text.
        migrations.AddField(
            model_name="document",
            name="document_month",
            field=models.CharField(blank=True, max_length=128, null=True),
        ),
        # Year of the document, stored as text rather than as an integer.
        migrations.AddField(
            model_name="document",
            name="document_year",
            field=models.CharField(blank=True, max_length=128, null=True),
        ),
        # Subject of the document.
        migrations.AddField(
            model_name="document",
            name="subject",
            field=models.CharField(blank=True, max_length=128, null=True),
        ),
    ]
--- a/docmanager_backend/documents/models.py
+++ b/docmanager_backend/documents/models.py
@ -12,12 +12,21 @@ class Document(models.Model):
    sent_from = models.CharField(
        max_length=128, null=True, blank=True
    )
    document_month = models.CharField(
        max_length=128, null=True, blank=True
    )
    document_year = models.CharField(
        max_length=128, null=True, blank=True
    )
    subject = models.CharField(
        max_length=128, null=True, blank=True
    )
    number_pages = models.IntegerField(null=False, blank=False)
    ocr_metadata = models.TextField(null=True, blank=True)
    def upload_to(instance, filename):
        _, extension = filename.rsplit(".", 1)
-        return "documents/%s_%s.%s" % (now(), str(uuid.uuid4()), extension)
+        return f"documents/{instance.document_type}/{instance.document_year}/{str(uuid.uuid4())}.{extension}"
    file = models.FileField(upload_to=upload_to)