2025-09-15 04:03:37 +08:00
10 changed files with 10 additions and 170 deletions
--- a/2
+++ b/2
@ -10,7 +10,7 @@ COPY scripts/ /app/scripts/
 RUN chmod +x /app/scripts/start.sh
 # Install packages
-RUN apt update && apt install -y graphviz libgraphviz-dev graphviz-dev tesseract-ocr tmux
+RUN apt update && apt install -y graphviz libgraphviz-dev graphviz-dev tesseract-ocr
 RUN pip3 install --upgrade pip && pip3 install --no-cache-dir -r requirements.txt
 # Expose port 8000 for the web server
--- a/docker-compose.dev.yml
+++ b/docker-compose.dev.yml
@ -10,8 +10,6 @@ services:
    environment:
      - PYTHONBUFFERED=1
    volumes:
      # The file watcher is broken under Docker on Windows because Docker does not notify the container about file changes made from Windows
      # If running on Windows, use a shared volume instead of a bind mount
      - .:/app
  # SMTP Server
--- a/docmanager_backend/config/management/init.py
+++ b/docmanager_backend/config/management/init.py
--- a/docmanager_backend/config/management/commands/start_watcher.py
+++ b/docmanager_backend/config/management/commands/start_watcher.py
@ -1,117 +0,0 @@
 from django.core.management.base import BaseCommand, CommandError
 from io import BytesIO
 from documents.models import Document
 from PIL import Image
 import pytesseract
 import fitz
 import os
 from config.settings import MEDIA_ROOT
 from watchdog.observers import Observer
 from watchdog.events import FileSystemEventHandler
 from documents.models import Document
 import logging
 import time
 class PDFHandler(FileSystemEventHandler):
    """Watchdog handler that OCRs newly created PDF files into Document rows.

    For every ``.pdf`` file created under the watched directory, each page is
    rasterized and run through Tesseract. The combined text is stored on a
    ``Document`` looked up (or created) by file name, and the source file is
    deleted once processing succeeds.
    """

    def __init__(self):
        super().__init__()
        # basicConfig is a no-op when the root logger is already configured,
        # so this only takes effect when nothing else has set up logging.
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s - %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')
        self.logger = logging.getLogger(__name__)
        self.logger.info("Starting Document Watcher...")

    def on_created(self, event):
        """Process the created path if it is a file whose name ends in ``.pdf``."""
        if event.is_directory:
            return
        if event.src_path.endswith('.pdf'):
            self.logger.info(f"New PDF file detected: {event.src_path}")
            self.process_pdf(event.src_path)

    @staticmethod
    def _detect_document_type(page_texts):
        """Return the type named by the first non-blank OCR line, else "other"."""
        # DOCUMENT_TYPE_CHOICES holds (value, label) pairs; only the stored
        # values are valid document types.
        valid_types = {value for value, _label in Document.DOCUMENT_TYPE_CHOICES}
        for text in page_texts:
            for line in text.split('\n'):
                candidate = line.strip().lower()
                if candidate:
                    # Only the first non-blank line of the document is
                    # considered, so later pages cannot overwrite the type.
                    return candidate if candidate in valid_types else "other"
        return "other"

    def process_pdf(self, file_path):
        """OCR the PDF at ``file_path`` and record it as a ``Document``.

        The document is matched by file name; an existing row is left
        untouched. The file is removed after the database step completes.
        Any failure is logged with its traceback and swallowed so the
        watcher keeps running.
        """
        try:
            filename = os.path.basename(file_path)
            # Same zoom matrix form the post_save signal handler uses.
            zoom = fitz.Matrix(1.2, 1.2)
            page_texts = []
            with fitz.open(file_path) as doc:
                num_pages = len(doc)
                for page in doc:
                    pix = page.get_pixmap(matrix=zoom)
                    # Hand the rendered page to PIL through an in-memory buffer
                    img = Image.open(BytesIO(pix.tobytes()))
                    page_texts.append(pytesseract.image_to_string(img).strip())

            # Page texts are concatenated without a separator, as before.
            metadata = "".join(page_texts)
            document_type = self._detect_document_type(page_texts)

            document, created = Document.objects.get_or_create(
                name=filename,
                defaults={
                    'number_pages': num_pages,
                    'ocr_metadata': metadata,
                    'document_type': document_type
                }
            )
            if created:
                self.logger.info(
                    f"Document '{filename}' created successfully with type '{document_type}'.")
            else:
                self.logger.info(f"Document '{filename}' already exists.")
            os.remove(file_path)
        except Exception as e:
            # Top-level boundary for the event callback: log with traceback
            # instead of letting one bad file kill the observer thread.
            self.logger.exception(f"Error processing PDF: {str(e)}")
 class PDFWatcher:
    """Runs a watchdog observer over the ``uploads`` directory under MEDIA_ROOT."""

    def __init__(self):
        self.observer = Observer()

    def run(self):
        """Watch the uploads directory recursively and block until stopped.

        A ``KeyboardInterrupt`` ends the loop quietly. Any other exception
        propagates to the caller, but the observer is always stopped and
        joined first.
        """
        event_handler = PDFHandler()
        watch_directory = os.path.join(MEDIA_ROOT, "uploads")
        self.observer.schedule(
            event_handler, watch_directory, recursive=True)
        self.observer.start()
        try:
            # The observer runs in its own thread; just keep this one alive.
            while True:
                time.sleep(5)
        except KeyboardInterrupt:
            # Ctrl-C is the expected way to stop the watcher.
            pass
        finally:
            self.observer.stop()
            self.observer.join()
 class Command(BaseCommand):
    """Management command entry point for the PDF file watcher."""

    help = "Runs a dedicated file watcher service"

    def handle(self, *args, **options):
        """Create a PDFWatcher and run it; blocks until the watcher returns."""
        PDFWatcher().run()
--- a/docmanager_backend/config/settings.py
+++ b/docmanager_backend/config/settings.py
@ -89,7 +89,6 @@ INSTALLED_APPS = [
    "corsheaders",
    "drf_spectacular",
    "drf_spectacular_sidecar",
    "config",
    "emails",
    "accounts",
    "documents",
@ -267,4 +266,4 @@ GRAPH_MODELS = {"app_labels": [
    "accounts", "documents", "document_requests", "questionnaires"]}
 CORS_ORIGIN_ALLOW_ALL = True
-CORS_ALLOW_CREDENTIALS = True
+CORS_ALLOW_CREDENTIALS = True
--- a/docmanager_backend/documents/migrations/0004_alter_document_document_type.py
+++ b/docmanager_backend/documents/migrations/0004_alter_document_document_type.py
@ -1,26 +0,0 @@
 # Generated by Django 5.1.3 on 2024-11-26 15:12
 from django.db import migrations, models
 class Migration(migrations.Migration):
    # Alters Document.document_type so its choices include
    # "documented procedures manual" alongside the other listed types.

    # Must be applied after migration 0003 of the "documents" app.
    dependencies = [
        ("documents", "0003_remove_document_metadata_document_ocr_metadata"),
    ]
    operations = [
        migrations.AlterField(
            model_name="document",
            name="document_type",
            field=models.CharField(
                # (stored value, human-readable label) pairs
                choices=[
                    ("memorandum", "Memorandum"),
                    ("hoa", "HOA"),
                    ("documented procedures manual", "Documented Procedures Manual"),
                    ("other", "Other"),
                ],
                max_length=32,
            ),
        ),
    ]
--- a/docmanager_backend/documents/models.py
+++ b/docmanager_backend/documents/models.py
@ -10,9 +10,7 @@ class Document(models.Model):
    DOCUMENT_TYPE_CHOICES = (
        ("memorandum", "Memorandum"),
        ("hoa", "HOA"),
-        ("documented procedures manual", "Documented Procedures Manual"),
+        # TODO: Update this list of document types
        ("other", "Other"),
    )
    document_type = models.CharField(
        max_length=32, choices=DOCUMENT_TYPE_CHOICES, null=False, blank=False
--- a/docmanager_backend/documents/signals.py
+++ b/docmanager_backend/documents/signals.py
@ -1,6 +1,3 @@
 from io import BytesIO
 from documents.models import Document
 from django.db.models.signals import post_save
 from django.dispatch import receiver
 from config.settings import MEDIA_ROOT
@ -12,25 +9,18 @@ from .models import Document
@receiver(post_save, sender=Document)
-def document_post_save(sender, instance, **kwargs):
+def domain_post_save(sender, instance, **kwargs):
    if not instance.ocr_metadata:
        metadata = ""
        with fitz.open(os.path.join(MEDIA_ROOT, instance.file.name)) as doc:
            mat = fitz.Matrix(1.2, 1.2)
            for page in doc:
                pix = page.get_pixmap(matrix=mat)
-                # Convert pixmap to bytes
+                output = f'{page.number}.jpg'
-                img_bytes = pix.tobytes()
+                pix.save(output)
-
+                res = str(pytesseract.image_to_string(Image.open(output)))
-                # Create a BytesIO object
+                os.remove(output)
-                img_buffer = BytesIO(img_bytes)
+                metadata += res
                # Create a PIL Image object from the bytes
                img = Image.open(img_buffer)
                # Perform OCR
                text = pytesseract.image_to_string(img).strip()
                metadata += text
        instance.ocr_metadata = metadata
        instance.save()
--- a/requirements.txt
+++ b/requirements.txt
@ -57,6 +57,5 @@ typing_extensions==4.12.2
 tzdata==2024.2
 uritemplate==4.1.1
 urllib3==2.2.3
 watchdog==6.0.0
 whitenoise==6.8.2
 pygraphviz==1.14; platform_system == 'Linux'
--- a/scripts/start.sh
+++ b/scripts/start.sh
@ -8,8 +8,7 @@ if [ ! -d "static" ]; then
    echo "Generating static files"
    python manage.py collectstatic --noinput
 fi
-    tmux new-session -d -s "API File Watcher" "cd /app/docmanager_backend && python manage.py start_watcher"
+if [ "$DEBUG" = 'True' ]; then   
 if [ "$DEBUG" = 'True' ]; then
    python manage.py runserver "0.0.0.0:8000"
 else
    gunicorn --workers 8 --bind 0.0.0.0:8000 config.wsgi:application