From 5bc170f5198c0f2ffa2177847b45737f83574afd Mon Sep 17 00:00:00 2001
From: Keannu Bernasol
Date: Wed, 27 Nov 2024 00:32:28 +0800
Subject: [PATCH] Add directory watcher service

---
Review notes (ignored by git am):
 * start_watcher.py: the document type is matched against the choice values
   (the tuple-of-pairs test always yielded "other"); the heading comes from
   the first page with text; the render uses fitz.Matrix; the multi-line
   f-string that needs Python 3.12 is gone; the bare except is replaced;
   the command help text describes this command.
 * signals.py: the handler writes OCR text with QuerySet.update() instead of
   instance.save(), which recursed when OCR returned no text.
 * Whitespace was rebuilt from the hunk counts; the post-image blob ids on
   the "index" lines are those of the original revision.

 .../config/management/__init__.py             |   0
 .../management/commands/start_watcher.py      | 153 ++++++++++++++++++
 docmanager_backend/config/settings.py         |   3 +-
 .../0004_alter_document_document_type.py      |  26 ++++
 docmanager_backend/documents/models.py        |   4 +-
 docmanager_backend/documents/signals.py       |  25 ++-
 6 files changed, 200 insertions(+), 11 deletions(-)
 create mode 100644 docmanager_backend/config/management/__init__.py
 create mode 100644 docmanager_backend/config/management/commands/start_watcher.py
 create mode 100644 docmanager_backend/documents/migrations/0004_alter_document_document_type.py

diff --git a/docmanager_backend/config/management/__init__.py b/docmanager_backend/config/management/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/docmanager_backend/config/management/commands/start_watcher.py b/docmanager_backend/config/management/commands/start_watcher.py
new file mode 100644
index 0000000..1d655cf
--- /dev/null
+++ b/docmanager_backend/config/management/commands/start_watcher.py
@@ -0,0 +1,153 @@
+"""Management command that imports PDFs dropped into the uploads directory."""
+
+import logging
+import os
+import time
+from io import BytesIO
+
+import fitz
+import pytesseract
+from django.core.management.base import BaseCommand, CommandError
+from PIL import Image
+from watchdog.events import FileSystemEventHandler
+from watchdog.observers import Observer
+
+from config.settings import MEDIA_ROOT
+from documents.models import Document
+
+
+class PDFHandler(FileSystemEventHandler):
+    """Turn every PDF created in the watched directory into a ``Document``."""
+
+    def __init__(self):
+        super().__init__()
+        logging.basicConfig(level=logging.INFO,
+                            format='%(asctime)s - %(message)s',
+                            datefmt='%Y-%m-%d %H:%M:%S')
+
+        self.logger = logging.getLogger(__name__)
+        self.logger.info("Starting Document Watcher...")
+
+    def on_created(self, event):
+        """Process a newly created file when its name ends in ``.pdf``."""
+        if event.is_directory:
+            return None
+
+        # Compare case-insensitively so that "SCAN.PDF" is picked up too
+        if event.src_path.lower().endswith('.pdf'):
+            self.logger.info(f"New PDF file detected: {event.src_path}")
+            self.process_pdf(event.src_path)
+
+    @staticmethod
+    def _first_text_line(text):
+        """Return the first non-blank line of ``text`` lower-cased, or ''."""
+        for line in text.splitlines():
+            if line.strip():
+                return line.strip().lower()
+        return ""
+
+    def process_pdf(self, file_path):
+        """OCR ``file_path``, record it as a ``Document`` and delete the file.
+
+        The document type is the first non-blank OCR line of the document
+        when it equals one of the ``Document.DOCUMENT_TYPE_CHOICES`` values,
+        otherwise ``"other"``. Errors are logged and the file is kept.
+        """
+        try:
+            filename = os.path.basename(file_path)
+            # The choices are (value, label) pairs, so a plain string is
+            # never "in" the tuple itself; compare against the values.
+            valid_types = {
+                value for value, _label in Document.DOCUMENT_TYPE_CHOICES}
+            document_type = ""
+            page_texts = []
+
+            with fitz.open(file_path) as doc:
+                num_pages = len(doc)
+                # Same zoom matrix as documents/signals.py; a 2-tuple is
+                # not a valid matrix for get_pixmap()
+                zoom = fitz.Matrix(1.2, 1.2)
+
+                for page in doc:
+                    pix = page.get_pixmap(matrix=zoom)
+                    # Hand the rendered page to PIL without touching disk
+                    img = Image.open(BytesIO(pix.tobytes()))
+                    text = pytesseract.image_to_string(img).strip()
+                    if not text:
+                        continue
+                    page_texts.append(text)
+
+                    # Later pages must not overwrite the detected heading
+                    if not document_type:
+                        document_type = self._first_text_line(text)
+
+            if document_type not in valid_types:
+                document_type = "other"
+            # Newlines keep the words of adjacent pages apart
+            metadata = "\n".join(page_texts)
+
+            # NOTE(review): no ``file`` is stored on the Document and the
+            # PDF is deleted below, so only the OCR text survives. Confirm
+            # this is intended.
+            _document, created = Document.objects.get_or_create(
+                name=filename,
+                defaults={
+                    'number_pages': num_pages,
+                    'ocr_metadata': metadata,
+                    'document_type': document_type
+                }
+            )
+
+            # Single-line fields: a line break inside "{...}" of a quoted
+            # f-string is a SyntaxError before Python 3.12.
+            if created:
+                self.logger.info(
+                    f"Document '{filename}' created successfully "
+                    f"with type '{document_type}'.")
+            else:
+                self.logger.info(f"Document '{filename}' already exists.")
+
+            os.remove(file_path)
+        except Exception as e:
+            # exception() keeps the original message and adds the traceback
+            self.logger.exception(f"Error processing PDF: {str(e)}")
+
+
+class PDFWatcher:
+    """Watch ``MEDIA_ROOT/uploads`` recursively until interrupted."""
+
+    def __init__(self):
+        self.observer = Observer()
+
+    def run(self):
+        """Start the observer and block until Ctrl+C or an error stops it."""
+        event_handler = PDFHandler()
+        watch_directory = os.path.join(MEDIA_ROOT, "uploads")
+        # A missing directory cannot be watched (e.g. on a fresh checkout)
+        os.makedirs(watch_directory, exist_ok=True)
+
+        self.observer.schedule(
+            event_handler, watch_directory, recursive=True)
+        self.observer.start()
+
+        try:
+            while True:
+                time.sleep(5)
+        except KeyboardInterrupt:
+            # Ctrl+C is the normal way to stop the watcher
+            pass
+        finally:
+            self.observer.stop()
+            self.observer.join()
+
+
+class Command(BaseCommand):
+    help = "Watches the uploads directory and imports new PDFs as documents"
+
+    def handle(self, *args, **options):
+        """Run the watcher; report a directory that cannot be watched."""
+        try:
+            PDFWatcher().run()
+        except OSError as e:
+            raise CommandError(
+                f"Cannot watch the uploads directory: {e}") from e
diff --git a/docmanager_backend/config/settings.py b/docmanager_backend/config/settings.py
index 02f277f..5714198 100644
--- a/docmanager_backend/config/settings.py
+++ b/docmanager_backend/config/settings.py
@@ -89,6 +89,7 @@ INSTALLED_APPS = [
     "corsheaders",
     "drf_spectacular",
     "drf_spectacular_sidecar",
+    "config",
     "emails",
     "accounts",
     "documents",
@@ -266,4 +267,4 @@ GRAPH_MODELS = {"app_labels": [
     "accounts", "documents", "document_requests", "questionnaires"]}
 
 CORS_ORIGIN_ALLOW_ALL = True
-CORS_ALLOW_CREDENTIALS = True
\ No newline at end of file
+CORS_ALLOW_CREDENTIALS = True
diff --git a/docmanager_backend/documents/migrations/0004_alter_document_document_type.py b/docmanager_backend/documents/migrations/0004_alter_document_document_type.py
new file mode 100644
index 0000000..348e24f
--- /dev/null
+++ b/docmanager_backend/documents/migrations/0004_alter_document_document_type.py
@@ -0,0 +1,26 @@
+# Generated by Django 5.1.3 on 2024-11-26 15:12
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("documents", "0003_remove_document_metadata_document_ocr_metadata"),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name="document",
+            name="document_type",
+            field=models.CharField(
+                choices=[
+                    ("memorandum", "Memorandum"),
+                    ("hoa", "HOA"),
+                    ("documented procedures manual", "Documented Procedures Manual"),
+                    ("other", "Other"),
+                ],
+                max_length=32,
+            ),
+        ),
+    ]
diff --git a/docmanager_backend/documents/models.py b/docmanager_backend/documents/models.py
index d7c750b..76c902b 100644
--- a/docmanager_backend/documents/models.py
+++ b/docmanager_backend/documents/models.py
@@ -10,7 +10,9 @@ class Document(models.Model):
     DOCUMENT_TYPE_CHOICES = (
         ("memorandum", "Memorandum"),
         ("hoa", "HOA"),
-        # TODO: Update this list on types of documents
+        ("documented procedures manual", "Documented Procedures Manual"),
+        ("other", "Other"),
+
     )
     document_type = models.CharField(
         max_length=32, choices=DOCUMENT_TYPE_CHOICES, null=False, blank=False
diff --git a/docmanager_backend/documents/signals.py b/docmanager_backend/documents/signals.py
index 0b7621c..34273c4 100644
--- a/docmanager_backend/documents/signals.py
+++ b/docmanager_backend/documents/signals.py
@@ -1,3 +1,4 @@
+from io import BytesIO
 from django.db.models.signals import post_save
 from django.dispatch import receiver
 from config.settings import MEDIA_ROOT
@@ -9,18 +10,24 @@ from .models import Document
 
 
 @receiver(post_save, sender=Document)
-def domain_post_save(sender, instance, **kwargs):
-    if not instance.ocr_metadata:
-        metadata = ""
+def document_post_save(sender, instance, **kwargs):
+    """Fill ``ocr_metadata`` by OCR-ing the stored file when it is empty."""
+    # A document without a stored file has nothing to OCR
+    if not instance.ocr_metadata and instance.file.name:
+        page_texts = []
         with fitz.open(os.path.join(MEDIA_ROOT, instance.file.name)) as doc:
             mat = fitz.Matrix(1.2, 1.2)
             for page in doc:
                 pix = page.get_pixmap(matrix=mat)
-                output = f'{page.number}.jpg'
-                pix.save(output)
-                res = str(pytesseract.image_to_string(Image.open(output)))
-                os.remove(output)
-                metadata += res
+                # OCR an in-memory render; no temp image is written to disk
+                img = Image.open(BytesIO(pix.tobytes()))
+                text = pytesseract.image_to_string(img).strip()
+                if text:
+                    page_texts.append(text)
 
+        # Newlines keep the words of adjacent pages apart
+        metadata = "\n".join(page_texts)
         instance.ocr_metadata = metadata
-        instance.save()
+        # update() does not send post_save; instance.save() re-entered this
+        # handler and recursed without end whenever OCR found no text
+        Document.objects.filter(pk=instance.pk).update(ocr_metadata=metadata)