diff --git a/Dockerfile b/Dockerfile index eab3bc7..ba0ca07 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,7 +10,7 @@ COPY scripts/ /app/scripts/ RUN chmod +x /app/scripts/start.sh # Install packages -RUN apt update && apt install -y graphviz libgraphviz-dev graphviz-dev tesseract-ocr tmux +RUN apt update && apt install -y graphviz libgraphviz-dev graphviz-dev tesseract-ocr RUN pip3 install --upgrade pip && pip3 install --no-cache-dir -r requirements.txt # Expose port 8000 for the web server diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index 933247a..98f8632 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -10,8 +10,6 @@ services: environment: - PYTHONBUFFERED=1 volumes: - # File Watcher is broken in Windows Docker since Docker does not notify container about any file changes you make from Windows - # If running on Windows, use a shared volume instead of bind mount - .:/app # SMTP Server diff --git a/docmanager_backend/config/management/__init__.py b/docmanager_backend/config/management/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/docmanager_backend/config/management/commands/start_watcher.py b/docmanager_backend/config/management/commands/start_watcher.py deleted file mode 100644 index a6b30cf..0000000 --- a/docmanager_backend/config/management/commands/start_watcher.py +++ /dev/null @@ -1,117 +0,0 @@ -from django.core.management.base import BaseCommand, CommandError - -from io import BytesIO -from documents.models import Document -from PIL import Image -import pytesseract -import fitz -import os -from config.settings import MEDIA_ROOT -from watchdog.observers import Observer -from watchdog.events import FileSystemEventHandler -from documents.models import Document -import logging -import time - - -class PDFHandler(FileSystemEventHandler): - def __init__(self): - logging.basicConfig(level=logging.INFO, - format='%(asctime)s - %(message)s', - datefmt='%Y-%m-%d %H:%M:%S') - - self.logger = logging.getLogger(__name__) - self.logger.info("Starting Document Watcher...") - - def on_created(self, event): - if event.is_directory: - return None - - if event.src_path.endswith('.pdf'): - self.logger.info(f"New PDF file detected: {event.src_path}") - self.process_pdf(event.src_path) - - def process_pdf(self, file_path): - try: - filename = os.path.basename(file_path) - metadata = "" - document_type = "" - - with fitz.open(file_path) as doc: - num_pages = len(doc) - - for page_num in range(num_pages): - page = doc[page_num] - pix = page.get_pixmap(matrix=(1.2, 1.2)) - - # Convert pixmap to bytes - img_bytes = pix.tobytes() - - # Create a BytesIO object - img_buffer = BytesIO(img_bytes) - - # Create a PIL Image object from the bytes - img = Image.open(img_buffer) - - # Perform OCR - text = pytesseract.image_to_string(img).strip() - - lines = text.split('\n') - - for line in lines: - if line.strip(): - document_type = line.strip().lower() - break - if not document_type or document_type not in Document.DOCUMENT_TYPE_CHOICES: - document_type = "other" - - metadata += text - - document, created = Document.objects.get_or_create( - name=filename, - defaults={ - 'number_pages': num_pages, - 'ocr_metadata': metadata, - 'document_type': document_type - } - ) - - if created: - self.logger.info(f"Document '{filename}' created successfully with type '{ - document_type}'.") - - else: - self.logger.info(f"Document '{filename}' already exists.") - - os.remove(file_path) - except Exception as e: - self.logger.error(f"Error processing PDF: {str(e)}") - - -class PDFWatcher: - def __init__(self): - self.observer = Observer() - - def run(self): - event_handler = PDFHandler() - watch_directory = os.path.join(MEDIA_ROOT, "uploads") - - self.observer.schedule( - event_handler, watch_directory, recursive=True) - self.observer.start() - - try: - while True: - time.sleep(5) - except: - self.observer.stop() - - self.observer.join() - - -class Command(BaseCommand): - help = "Runs a dedicated file watcher service" - - def handle(self, *args, **options): - watcher = PDFWatcher() - watcher.run() diff --git a/docmanager_backend/config/settings.py b/docmanager_backend/config/settings.py index 5714198..02f277f 100644 --- a/docmanager_backend/config/settings.py +++ b/docmanager_backend/config/settings.py @@ -89,7 +89,6 @@ INSTALLED_APPS = [ "corsheaders", "drf_spectacular", "drf_spectacular_sidecar", - "config", "emails", "accounts", "documents", @@ -267,4 +266,4 @@ GRAPH_MODELS = {"app_labels": [ "accounts", "documents", "document_requests", "questionnaires"]} CORS_ORIGIN_ALLOW_ALL = True -CORS_ALLOW_CREDENTIALS = True +CORS_ALLOW_CREDENTIALS = True \ No newline at end of file diff --git a/docmanager_backend/documents/migrations/0004_alter_document_document_type.py b/docmanager_backend/documents/migrations/0004_alter_document_document_type.py deleted file mode 100644 index 348e24f..0000000 --- a/docmanager_backend/documents/migrations/0004_alter_document_document_type.py +++ /dev/null @@ -1,26 +0,0 @@ -# Generated by Django 5.1.3 on 2024-11-26 15:12 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ("documents", "0003_remove_document_metadata_document_ocr_metadata"), - ] - - operations = [ - migrations.AlterField( - model_name="document", - name="document_type", - field=models.CharField( - choices=[ - ("memorandum", "Memorandum"), - ("hoa", "HOA"), - ("documented procedures manual", "Documented Procedures Manual"), - ("other", "Other"), - ], - max_length=32, - ), - ), - ] diff --git a/docmanager_backend/documents/models.py b/docmanager_backend/documents/models.py index 76c902b..d7c750b 100644 --- a/docmanager_backend/documents/models.py +++ b/docmanager_backend/documents/models.py @@ -10,9 +10,7 @@ class Document(models.Model): DOCUMENT_TYPE_CHOICES = ( ("memorandum", "Memorandum"), ("hoa", "HOA"), - ("documented procedures manual", "Documented Procedures Manual"), - ("other", "Other"), - + # TODO: Update this list on types of documents ) document_type = models.CharField( max_length=32, choices=DOCUMENT_TYPE_CHOICES, null=False, blank=False diff --git a/docmanager_backend/documents/signals.py b/docmanager_backend/documents/signals.py index 34273c4..0b7621c 100644 --- a/docmanager_backend/documents/signals.py +++ b/docmanager_backend/documents/signals.py @@ -1,6 +1,3 @@ - -from io import BytesIO -from documents.models import Document from django.db.models.signals import post_save from django.dispatch import receiver from config.settings import MEDIA_ROOT @@ -12,25 +9,18 @@ from .models import Document @receiver(post_save, sender=Document) -def document_post_save(sender, instance, **kwargs): +def domain_post_save(sender, instance, **kwargs): if not instance.ocr_metadata: metadata = "" with fitz.open(os.path.join(MEDIA_ROOT, instance.file.name)) as doc: mat = fitz.Matrix(1.2, 1.2) for page in doc: pix = page.get_pixmap(matrix=mat) - # Convert pixmap to bytes - img_bytes = pix.tobytes() - - # Create a BytesIO object - img_buffer = BytesIO(img_bytes) - - # Create a PIL Image object from the bytes - img = Image.open(img_buffer) - - # Perform OCR - text = pytesseract.image_to_string(img).strip() - metadata += text + output = f'{page.number}.jpg' + pix.save(output) + res = str(pytesseract.image_to_string(Image.open(output))) + os.remove(output) + metadata += res instance.ocr_metadata = metadata instance.save() diff --git a/requirements.txt b/requirements.txt index 5e3db18..eb57c31 100644 --- a/requirements.txt +++ b/requirements.txt @@ -57,6 +57,5 @@ typing_extensions==4.12.2 tzdata==2024.2 uritemplate==4.1.1 urllib3==2.2.3 -watchdog==6.0.0 whitenoise==6.8.2 pygraphviz==1.14; platform_system == 'Linux' \ No newline at end of file diff --git a/scripts/start.sh b/scripts/start.sh index 4fffa6f..ee81d26 100644 --- a/scripts/start.sh +++ b/scripts/start.sh @@ -8,8 +8,7 @@ if [ ! -d "static" ]; then echo "Generating static files" python manage.py collectstatic --noinput fi - tmux new-session -d -s "API File Watcher" "cd /app/docmanager_backend && python manage.py start_watcher" -if [ "$DEBUG" = 'True' ]; then +if [ "$DEBUG" = 'True' ]; then python manage.py runserver "0.0.0.0:8000" else gunicorn --workers 8 --bind 0.0.0.0:8000 config.wsgi:application