Add directory watcher service

2025-08-29 20:13:38 +08:00 · 2024-11-27 00:32:28 +08:00 · 2024-11-27 00:32:28 +08:00 · 5bc170f519
commit 5bc170f519
parent 957272cd07
6 changed files with 164 additions and 8 deletions
--- a/docmanager_backend/documents/migrations/0004_alter_document_document_type.py
+++ b/docmanager_backend/documents/migrations/0004_alter_document_document_type.py
@ -0,0 +1,26 @@
+# Generated by Django 5.1.3 on 2024-11-26 15:12
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):  # Alters the choices declared on Document.document_type
+
+    dependencies = [  # applied after the previous migration of the "documents" app
+        ("documents", "0003_remove_document_metadata_document_ocr_metadata"),
+    ]
+
+    operations = [  # a single AlterField on document.document_type
+        migrations.AlterField(
+            model_name="document",
+            name="document_type",
+            field=models.CharField(
+                choices=[  # (stored value, human-readable label) pairs
+                    ("memorandum", "Memorandum"),
+                    ("hoa", "HOA"),
+                    ("documented procedures manual", "Documented Procedures Manual"),
+                    ("other", "Other"),
+                ],
+                max_length=32,  # longest stored value above is 28 characters
+            ),
+        ),
+    ]
--- a/docmanager_backend/documents/models.py
+++ b/docmanager_backend/documents/models.py
@ -10,7 +10,9 @@ class Document(models.Model):
    DOCUMENT_TYPE_CHOICES = (
        ("memorandum", "Memorandum"),
        ("hoa", "HOA"),
-        # TODO: Update this list on types of documents
+        ("documented procedures manual", "Documented Procedures Manual"),
+        ("other", "Other"),
+
    )
    document_type = models.CharField(
        max_length=32, choices=DOCUMENT_TYPE_CHOICES, null=False, blank=False
--- a/docmanager_backend/documents/signals.py
+++ b/docmanager_backend/documents/signals.py
@ -1,3 +1,6 @@
+
+from io import BytesIO
+from documents.models import Document
 from django.db.models.signals import post_save
 from django.dispatch import receiver
 from config.settings import MEDIA_ROOT
@ -9,18 +12,25 @@ from .models import Document


@receiver(post_save, sender=Document)
-def domain_post_save(sender, instance, **kwargs):
+def document_post_save(sender, instance, **kwargs):
    if not instance.ocr_metadata:
        metadata = ""
        with fitz.open(os.path.join(MEDIA_ROOT, instance.file.name)) as doc:
            mat = fitz.Matrix(1.2, 1.2)
            for page in doc:
                pix = page.get_pixmap(matrix=mat)
-                output = f'{page.number}.jpg'
-                pix.save(output)
-                res = str(pytesseract.image_to_string(Image.open(output)))
-                os.remove(output)
-                metadata += res
+                # Convert pixmap to bytes
+                img_bytes = pix.tobytes()
+
+                # Create a BytesIO object
+                img_buffer = BytesIO(img_bytes)
+
+                # Create a PIL Image object from the bytes
+                img = Image.open(img_buffer)
+
+                # Perform OCR
+                text = pytesseract.image_to_string(img).strip()
+                metadata += text

        instance.ocr_metadata = metadata
        instance.save()