From 674a7ec5923188bd1f3c3e8e3904215607715de3 Mon Sep 17 00:00:00 2001
From: Keannu Bernasol
Date: Wed, 8 Jan 2025 13:38:39 +0800
Subject: [PATCH] Implement sender LLM OCR and request remarks
---
.../management/commands/start_watcher.py | 125 +++++++++---------
.../0003_documentrequest_denied_remarks.py | 18 +++
..._denied_remarks_documentrequest_remarks.py | 18 +++
.../document_requests/models.py | 2 +
.../document_requests/serializers.py | 23 +++-
.../0003_document_memorandum_from.py | 18 +++
...name_memorandum_from_document_sent_from.py | 18 +++
docmanager_backend/documents/models.py | 3 +
docmanager_backend/documents/serializers.py | 4 +
docmanager_backend/emails/templates.py | 1 +
.../emails/templates/request_approved.html | 6 +
11 files changed, 174 insertions(+), 62 deletions(-)
create mode 100644 docmanager_backend/document_requests/migrations/0003_documentrequest_denied_remarks.py
create mode 100644 docmanager_backend/document_requests/migrations/0004_rename_denied_remarks_documentrequest_remarks.py
create mode 100644 docmanager_backend/documents/migrations/0003_document_memorandum_from.py
create mode 100644 docmanager_backend/documents/migrations/0004_rename_memorandum_from_document_sent_from.py
diff --git a/docmanager_backend/config/management/commands/start_watcher.py b/docmanager_backend/config/management/commands/start_watcher.py
index a5dc38f..401f5e3 100644
--- a/docmanager_backend/config/management/commands/start_watcher.py
+++ b/docmanager_backend/config/management/commands/start_watcher.py
@@ -69,79 +69,84 @@ class PDFHandler(FileSystemEventHandler):
with fitz.open(file_path) as doc:
num_pages = len(doc)
- for page_num in range(num_pages):
- page = doc[page_num]
- pix = page.get_pixmap(matrix=(1.2, 1.2))
+ # Perform OCR only on the first page
+ page = doc[0]
+ pix = page.get_pixmap(matrix=(1.2, 1.2))
- # Convert pixmap to bytes
- img_bytes = pix.tobytes()
+ # Convert pixmap to bytes
+ img_bytes = pix.tobytes()
- # Create a BytesIO object
- img_buffer = BytesIO(img_bytes)
+ # Create a BytesIO object
+ img_buffer = BytesIO(img_bytes)
- # Create a PIL Image object from the bytes
- img = Image.open(img_buffer)
+ # Create a PIL Image object from the bytes
+ img = Image.open(img_buffer)
- # Perform OCR
- text = pytesseract.image_to_string(img).strip()
+ # Perform OCR
+ text = pytesseract.image_to_string(img).strip()
- # Try to pass image to the Ollama image recognition API first
- try:
- class DocumentCategory(BaseModel):
- category: str = "other"
- explanation: Optional[str] = None
+ # Try to pass image to the Ollama image recognition API first
+ try:
+ class DocumentCategory(BaseModel):
+ category: str = "other"
+ sent_from: str = "N/A"
+ explanation: Optional[str] = None
- client = Client(
- host=get_secret("OLLAMA_URL"),
- auth=httpx.BasicAuth(
- username=get_secret("OLLAMA_USERNAME"), password=get_secret("OLLAMA_PASSWORD")) if get_secret("OLLAMA_USE_AUTH") else None,
- )
+ client = Client(
+ host=get_secret("OLLAMA_URL"),
+ auth=httpx.BasicAuth(
+ username=get_secret("OLLAMA_USERNAME"), password=get_secret("OLLAMA_PASSWORD")) if get_secret("OLLAMA_USE_AUTH") else None,
+ )
- encoded_image = base64.b64encode(
- img_buffer.getvalue()).decode()
+ encoded_image = base64.b64encode(
+ img_buffer.getvalue()).decode()
- possible_categories = set((Document.objects.all().values_list(
- "document_type", flat=True), "Documented Procedures Manual", "Form", "Special Order"))
- prompt = f"""
- Read the text from the image and provide a category. Return as JSON.
+ possible_categories = set(Document.objects.all().values_list(
+ "document_type", flat=True)) | {"Documented Procedures Manual", "Form", "Special Order", "Memorandum"}
+ prompt = f"""
+ Read the text from the image and provide a category. Return as JSON.
- Possible categories are: {possible_categories}. You are free to create a new one if none are suitable.
- """
- response = client.chat(
- model=get_secret("OLLAMA_MODEL"),
- messages=[
- {"role": "user",
- "content": prompt,
- "images": [encoded_image]},
- ],
- format=DocumentCategory.model_json_schema(),
- options={
- "temperature": 0
- },
+ Possible categories are: {possible_categories}. You are free to create a new one if none are suitable.
- )
+ If the document is of type Special Order or Memorandum, provide the sender of the document. Possible senders are Vice President, President, Chancellor.
+ Otherwise, provide N/A.
+ """
+ response = client.chat(
+ model=get_secret("OLLAMA_MODEL"),
+ messages=[
+ {"role": "user",
+ "content": prompt,
+ "images": [encoded_image]},
+ ],
+ format=DocumentCategory.model_json_schema(),
+ options={
+ "temperature": 0
+ },
- DocumentCategory.model_validate_json(
- response.message.content)
- result = json.loads(response.message.content)
- document_type = result.get("category")
+ )
- # If that fails, just use regular OCR read the title as a dirty fix/fallback
- except Exception as e:
- self.logger.warning(f"Error! {e}")
- self.logger.warning(
- "Ollama OCR offload failed. Falling back to default OCR")
- lines = text.split("\n")
+ DocumentCategory.model_validate_json(
+ response.message.content)
+ result = json.loads(response.message.content)
+ document_type = result.get("category")
+ sent_from = result.get("sent_from")
- for line in lines:
- if line.strip():
- document_type = line.strip().lower()
- break
+ # If that fails, just use regular OCR read the title as a dirty fix/fallback
+ except Exception as e:
+ self.logger.warning(f"Error! {e}")
+ self.logger.warning("Ollama OCR offload failed. Falling back to default OCR")
+ sent_from = "N/A"  # fallback OCR only derives a document type; keep sent_from defined for later use
+ lines = text.split("\n")
- if not document_type:
- document_type = "other"
+ for line in lines:
+ if line.strip():
+ document_type = line.strip().lower()
+ break
- metadata += text
+ if not document_type:
+ document_type = "other"
+
+ metadata += text
# Open the file for instance creation
DOCUMENT, created = Document.objects.get_or_create(
@@ -158,8 +163,10 @@ class PDFHandler(FileSystemEventHandler):
name=filename, content=File(open(file_path, "rb")))
self.logger.info(
f"Document '{filename}' created successfully with type '{
- document_type}'."
+ document_type}'. sent_from: {sent_from}"
)
+ DOCUMENT.sent_from = sent_from
+ DOCUMENT.save()
else:
self.logger.info(f"Document '{filename}' already exists.")
diff --git a/docmanager_backend/document_requests/migrations/0003_documentrequest_denied_remarks.py b/docmanager_backend/document_requests/migrations/0003_documentrequest_denied_remarks.py
new file mode 100644
index 0000000..675b1ec
--- /dev/null
+++ b/docmanager_backend/document_requests/migrations/0003_documentrequest_denied_remarks.py
@@ -0,0 +1,18 @@
+# Generated by Django 5.1.3 on 2025-01-08 04:29
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ """Add the optional ``denied_remarks`` text field (max length 512) to ``DocumentRequest``."""
+ dependencies = [
+ ("document_requests", "0002_documentrequest_questionnaire"),
+ ]
+
+ operations = [
+ migrations.AddField(
+ model_name="documentrequest",
+ name="denied_remarks",
+ field=models.TextField(blank=True, max_length=512, null=True),
+ ),
+ ]
diff --git a/docmanager_backend/document_requests/migrations/0004_rename_denied_remarks_documentrequest_remarks.py b/docmanager_backend/document_requests/migrations/0004_rename_denied_remarks_documentrequest_remarks.py
new file mode 100644
index 0000000..3789bb2
--- /dev/null
+++ b/docmanager_backend/document_requests/migrations/0004_rename_denied_remarks_documentrequest_remarks.py
@@ -0,0 +1,18 @@
+# Generated by Django 5.1.3 on 2025-01-08 04:51
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+ """Rename ``DocumentRequest.denied_remarks`` to ``remarks``."""
+ dependencies = [
+ ("document_requests", "0003_documentrequest_denied_remarks"),
+ ]
+
+ operations = [
+ migrations.RenameField(
+ model_name="documentrequest",
+ old_name="denied_remarks",
+ new_name="remarks",
+ ),
+ ]
diff --git a/docmanager_backend/document_requests/models.py b/docmanager_backend/document_requests/models.py
index 1efc716..7c40f44 100644
--- a/docmanager_backend/document_requests/models.py
+++ b/docmanager_backend/document_requests/models.py
@@ -27,6 +27,8 @@ class DocumentRequest(models.Model):
("denied", "Denied"),
)
+ remarks = models.TextField(max_length=512, blank=True, null=True)
+
status = models.CharField(
max_length=32, choices=STATUS_CHOICES, default="pending")
diff --git a/docmanager_backend/document_requests/serializers.py b/docmanager_backend/document_requests/serializers.py
index 23d065c..7ab83df 100644
--- a/docmanager_backend/document_requests/serializers.py
+++ b/docmanager_backend/document_requests/serializers.py
@@ -101,6 +101,7 @@ class DocumentRequestSerializer(serializers.ModelSerializer):
"purpose",
"date_requested",
"documents",
+ "remarks",
"status",
]
read_only_fields = [
@@ -112,6 +113,7 @@ class DocumentRequestSerializer(serializers.ModelSerializer):
"purpose",
"date_requested",
"documents",
+ "remarks",
"status",
]
@@ -146,6 +148,7 @@ class FullDocumentRequestSerializer(serializers.ModelSerializer):
"purpose",
"date_requested",
"documents",
+ "remarks",
"status",
]
read_only_fields = [
@@ -167,27 +170,41 @@ class DocumentRequestUpdateSerializer(serializers.ModelSerializer):
class Meta:
model = DocumentRequest
- fields = ["id", "status"]
- read_only_fields = ["id", "status"]
+ fields = ["id", "status", "remarks"]
+ read_only_fields = ["id"]
def update(self, instance, validated_data):
+ # Approved/denied requests are final; any other update must supply a status that differs from the current one.
if instance.status == "denied" or instance.status == "approved":
raise serializers.ValidationError(
{
"error": "Already approved/denied requests cannot be updated. You should instead create a new request and approve it from there"
}
)
+ elif "status" not in validated_data:
+ raise serializers.ValidationError(
+ {
+ "error": "No status value update provided"
+ }
+ )
elif validated_data["status"] == instance.status:
raise serializers.ValidationError(
{"error": "Request form status provided is the same as current status"}
)
-
+ elif validated_data["status"] == "denied" and not validated_data.get("remarks"):
+ raise serializers.ValidationError(
+ {"error": "Request denial requires remarks"}
+ )
representation = super().update(instance, validated_data)
# Send an email on request status update
try:
email = RequestUpdateEmail()
email.context = {"request_status": instance.status}
+ if instance.status == "denied":
+ email.context["remarks"] = instance.remarks
+ else:
+ email.context["remarks"] = "N/A"
email.send(to=[instance.requester.email])
except:
# Silence out errors if email sending fails
diff --git a/docmanager_backend/documents/migrations/0003_document_memorandum_from.py b/docmanager_backend/documents/migrations/0003_document_memorandum_from.py
new file mode 100644
index 0000000..29714a9
--- /dev/null
+++ b/docmanager_backend/documents/migrations/0003_document_memorandum_from.py
@@ -0,0 +1,18 @@
+# Generated by Django 5.1.3 on 2025-01-08 04:39
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ """Add the optional ``memorandum_from`` char field (max length 128) to ``Document``."""
+ dependencies = [
+ ("documents", "0002_alter_document_document_type"),
+ ]
+
+ operations = [
+ migrations.AddField(
+ model_name="document",
+ name="memorandum_from",
+ field=models.CharField(blank=True, max_length=128, null=True),
+ ),
+ ]
diff --git a/docmanager_backend/documents/migrations/0004_rename_memorandum_from_document_sent_from.py b/docmanager_backend/documents/migrations/0004_rename_memorandum_from_document_sent_from.py
new file mode 100644
index 0000000..c1aee0c
--- /dev/null
+++ b/docmanager_backend/documents/migrations/0004_rename_memorandum_from_document_sent_from.py
@@ -0,0 +1,18 @@
+# Generated by Django 5.1.3 on 2025-01-08 04:44
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+ """Rename ``Document.memorandum_from`` to ``sent_from``."""
+ dependencies = [
+ ("documents", "0003_document_memorandum_from"),
+ ]
+
+ operations = [
+ migrations.RenameField(
+ model_name="document",
+ old_name="memorandum_from",
+ new_name="sent_from",
+ ),
+ ]
diff --git a/docmanager_backend/documents/models.py b/docmanager_backend/documents/models.py
index 484a115..59464cf 100644
--- a/docmanager_backend/documents/models.py
+++ b/docmanager_backend/documents/models.py
@@ -9,6 +9,9 @@ class Document(models.Model):
document_type = models.CharField(
max_length=128, null=False, blank=False
)
+ sent_from = models.CharField(
+ max_length=128, null=True, blank=True
+ )
number_pages = models.IntegerField(null=False, blank=False)
ocr_metadata = models.TextField(null=True, blank=True)
diff --git a/docmanager_backend/documents/serializers.py b/docmanager_backend/documents/serializers.py
index 6ef47eb..9e62e56 100644
--- a/docmanager_backend/documents/serializers.py
+++ b/docmanager_backend/documents/serializers.py
@@ -53,6 +53,7 @@ class DocumentSerializer(serializers.ModelSerializer):
"document_type",
"number_pages",
"ocr_metadata",
+ "sent_from",
"date_uploaded",
]
read_only_fields = [
@@ -61,6 +62,7 @@ class DocumentSerializer(serializers.ModelSerializer):
"document_type",
"number_pages",
"ocr_metadata",
+ "sent_from",
"date_uploaded",
]
@@ -81,6 +83,7 @@ class DocumentFileSerializer(serializers.ModelSerializer):
"number_pages",
"ocr_metadata",
"date_uploaded",
+ "sent_from",
"file",
]
read_only_fields = [
@@ -90,5 +93,6 @@ class DocumentFileSerializer(serializers.ModelSerializer):
"number_pages",
"ocr_metadata",
"date_uploaded",
+ "sent_from",
"file",
]
diff --git a/docmanager_backend/emails/templates.py b/docmanager_backend/emails/templates.py
index acae072..96f4699 100644
--- a/docmanager_backend/emails/templates.py
+++ b/docmanager_backend/emails/templates.py
@@ -8,6 +8,7 @@ class RequestUpdateEmail(email.BaseEmailMessage):
def get_context_data(self):
context = super().get_context_data()
context["request_status"] = context.get("request_status")
+ context["remarks"] = context.get("remarks")
context["url"] = FRONTEND_URL
context.update(self.context)
return context
diff --git a/docmanager_backend/emails/templates/request_approved.html b/docmanager_backend/emails/templates/request_approved.html
index 2b74432..9244db5 100644
--- a/docmanager_backend/emails/templates/request_approved.html
+++ b/docmanager_backend/emails/templates/request_approved.html
@@ -7,6 +7,8 @@
{% block text_body %}
{% blocktrans %}You're receiving this email because your document request has been {{ request_status }}.{% endblocktrans %}
+ {% blocktrans %}Remarks: {{ remarks }}{% endblocktrans %}
+
{% trans 'Please visit the site to check your request:' %}
{{ url|safe }}
@@ -18,6 +20,10 @@
{% blocktrans %}You're receiving this email because your document request has been {{ request_status }}.{% endblocktrans %}
+
+ {% blocktrans %}Remarks: {{ remarks }}{% endblocktrans %}
+
+
{% trans 'Please visit the site to check your request:' %}