From b76adb3601f743b831c99f57e45e02223a2e36db Mon Sep 17 00:00:00 2001
From: Keannu Bernasol
Date: Sun, 24 Nov 2024 14:08:27 +0800
Subject: [PATCH] Add OCR metadata field processing for documents

---
Note: documents/signals.py below was revised in review (the handler no longer
re-enters post_save and no longer writes page images to the working
directory); the blob id on its index line was not recomputed.

 docmanager_backend/documents/apps.py          |  4 ++
 ...t_metadata_alter_document_document_type.py | 25 ++++++++++++
 ...document_metadata_document_ocr_metadata.py | 22 +++++++++++
 docmanager_backend/documents/models.py        | 11 +++---
 docmanager_backend/documents/serializers.py   |  7 +++-
 docmanager_backend/documents/signals.py       | 40 ++++++++++++++++++++
 6 files changed, 101 insertions(+), 8 deletions(-)
 create mode 100644 docmanager_backend/documents/migrations/0002_document_metadata_alter_document_document_type.py
 create mode 100644 docmanager_backend/documents/migrations/0003_remove_document_metadata_document_ocr_metadata.py
 create mode 100644 docmanager_backend/documents/signals.py

diff --git a/docmanager_backend/documents/apps.py b/docmanager_backend/documents/apps.py
index 37ce729..e4a5400 100644
--- a/docmanager_backend/documents/apps.py
+++ b/docmanager_backend/documents/apps.py
@@ -4,3 +4,7 @@ from django.apps import AppConfig
 class DocumentsConfig(AppConfig):
     default_auto_field = "django.db.models.BigAutoField"
     name = "documents"
+
+    def ready(self) -> None:
+        import documents.signals
+        return super().ready()
diff --git a/docmanager_backend/documents/migrations/0002_document_metadata_alter_document_document_type.py b/docmanager_backend/documents/migrations/0002_document_metadata_alter_document_document_type.py
new file mode 100644
index 0000000..fb63337
--- /dev/null
+++ b/docmanager_backend/documents/migrations/0002_document_metadata_alter_document_document_type.py
@@ -0,0 +1,25 @@
+# Generated by Django 5.1.3 on 2024-11-24 05:17
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("documents", "0001_initial"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="document",
+            name="metadata",
+            field=models.TextField(null=True),
+        ),
+        migrations.AlterField(
+            model_name="document",
+            name="document_type",
+            field=models.CharField(
+                choices=[("memorandum", "Memorandum"), ("hoa", "HOA")], max_length=32
+            ),
+        ),
+    ]
diff --git a/docmanager_backend/documents/migrations/0003_remove_document_metadata_document_ocr_metadata.py b/docmanager_backend/documents/migrations/0003_remove_document_metadata_document_ocr_metadata.py
new file mode 100644
index 0000000..42db5c8
--- /dev/null
+++ b/docmanager_backend/documents/migrations/0003_remove_document_metadata_document_ocr_metadata.py
@@ -0,0 +1,22 @@
+# Generated by Django 5.1.3 on 2024-11-24 06:04
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("documents", "0002_document_metadata_alter_document_document_type"),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name="document",
+            name="metadata",
+        ),
+        migrations.AddField(
+            model_name="document",
+            name="ocr_metadata",
+            field=models.TextField(blank=True, null=True),
+        ),
+    ]
diff --git a/docmanager_backend/documents/models.py b/docmanager_backend/documents/models.py
index be7d83d..d7c750b 100644
--- a/docmanager_backend/documents/models.py
+++ b/docmanager_backend/documents/models.py
@@ -1,3 +1,4 @@
+
 from django.db import models
 from django.utils.timezone import now
 import uuid
@@ -7,17 +8,15 @@ class Document(models.Model):
     name = models.CharField(max_length=100)
 
     DOCUMENT_TYPE_CHOICES = (
-        ("pdf", "PDF"),
-        ("image", "Image"),
-        ("video", "Video"),
-        ("doc", "Word Document"),
-        ("excel", "Excel Document"),
-        ("ppt", "Powerpoint Document"),
+        ("memorandum", "Memorandum"),
+        ("hoa", "HOA"),
+        # TODO: Update this list on types of documents
     )
     document_type = models.CharField(
         max_length=32, choices=DOCUMENT_TYPE_CHOICES, null=False, blank=False
     )
     number_pages = models.IntegerField(null=False, blank=False)
+    ocr_metadata = models.TextField(null=True, blank=True)
 
     def upload_to(instance, filename):
         _, extension = filename.split(".")
diff --git a/docmanager_backend/documents/serializers.py b/docmanager_backend/documents/serializers.py
index 80a3594..5fc2247 100644
--- a/docmanager_backend/documents/serializers.py
+++ b/docmanager_backend/documents/serializers.py
@@ -36,12 +36,13 @@ class DocumentSerializer(serializers.ModelSerializer):
     class Meta:
         model = Document
         fields = ["id", "name", "document_type",
-                  "number_pages", "date_uploaded"]
+                  "number_pages", "ocr_metadata", "date_uploaded"]
         read_only_fields = [
             "id",
             "name",
             "document_type",
             "number_pages",
+            "ocr_metadata",
             "date_uploaded",
         ]
 
@@ -59,15 +60,17 @@ class DocumentFileSerializer(serializers.ModelSerializer):
             "id",
             "name",
             "document_type",
-            "file",
             "number_pages",
+            "ocr_metadata",
             "date_uploaded",
+            "file",
         ]
         read_only_fields = [
             "id",
             "name",
             "document_type",
             "number_pages",
+            "ocr_metadata",
             "date_uploaded",
             "file",
         ]
diff --git a/docmanager_backend/documents/signals.py b/docmanager_backend/documents/signals.py
new file mode 100644
index 0000000..0b7621c
--- /dev/null
+++ b/docmanager_backend/documents/signals.py
@@ -0,0 +1,40 @@
+from django.db.models.signals import post_save
+from django.dispatch import receiver
+from config.settings import MEDIA_ROOT
+import os
+import fitz
+import pytesseract
+from PIL import Image
+from .models import Document
+
+
+def _extract_text(path):
+    """Return the OCR text of every page of the document at ``path``."""
+    # Same 1.2x zoom on both axes for every page render.
+    zoom = fitz.Matrix(1.2, 1.2)
+    page_texts = []
+    with fitz.open(path) as doc:
+        for page in doc:
+            pix = page.get_pixmap(matrix=zoom, alpha=False)
+            # Hand the pixels to PIL in memory. Saving "<page>.jpg" into the
+            # working directory collides between concurrent uploads and
+            # leaves the file behind whenever OCR raises.
+            image = Image.frombytes(
+                "RGB", (pix.width, pix.height), pix.samples
+            )
+            page_texts.append(str(pytesseract.image_to_string(image)))
+    return "".join(page_texts)
+
+
+@receiver(post_save, sender=Document)
+def domain_post_save(sender, instance, **kwargs):
+    """Fill in ``ocr_metadata`` for a saved Document that has none."""
+    if instance.ocr_metadata:
+        return
+
+    metadata = _extract_text(os.path.join(MEDIA_ROOT, instance.file.name))
+    instance.ocr_metadata = metadata
+    # Persist through the queryset rather than instance.save(): save() fires
+    # post_save again, and when OCR yields no text the guard above never
+    # trips, so the handler would recurse until RecursionError.
+    Document.objects.filter(pk=instance.pk).update(ocr_metadata=metadata)