Compare commits

..

No commits in common. "463e33d219fd76b685d481da363f0651f954fcc4" and "957272cd07cc32eea6f5cd94c314a6f246469b7f" have entirely different histories.

10 changed files with 10 additions and 170 deletions

View file

@ -10,7 +10,7 @@ COPY scripts/ /app/scripts/
RUN chmod +x /app/scripts/start.sh RUN chmod +x /app/scripts/start.sh
# Install packages # Install packages
RUN apt update && apt install -y graphviz libgraphviz-dev graphviz-dev tesseract-ocr tmux RUN apt update && apt install -y graphviz libgraphviz-dev graphviz-dev tesseract-ocr
RUN pip3 install --upgrade pip && pip3 install --no-cache-dir -r requirements.txt RUN pip3 install --upgrade pip && pip3 install --no-cache-dir -r requirements.txt
# Expose port 8000 for the web server # Expose port 8000 for the web server

View file

@ -10,8 +10,6 @@ services:
environment: environment:
- PYTHONBUFFERED=1 - PYTHONBUFFERED=1
volumes: volumes:
# The file watcher does not work in Docker on Windows, because Docker does not notify the container about file changes made from the Windows host.
# If running on Windows, use a shared volume instead of a bind mount.
- .:/app - .:/app
# SMTP Server # SMTP Server

View file

@ -1,117 +0,0 @@
from django.core.management.base import BaseCommand, CommandError
from io import BytesIO
from documents.models import Document
from PIL import Image
import pytesseract
import fitz
import os
from config.settings import MEDIA_ROOT
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from documents.models import Document
import logging
import time
class PDFHandler(FileSystemEventHandler):
    """Watchdog event handler that OCRs newly created PDFs into ``Document`` rows.

    For every ``.pdf`` file created under the watched directory, each page is
    rendered to an image, run through Tesseract, and the combined text is
    stored on a ``Document`` keyed by the file's base name. The source file
    is deleted once it has been processed.
    """

    # Zoom factor applied on both axes when rendering a page for OCR.
    OCR_ZOOM = 1.2
    # Type stored when the first OCR line is not a known document type.
    FALLBACK_DOCUMENT_TYPE = "other"

    def __init__(self):
        """Configure logging and announce that the watcher is starting."""
        super().__init__()
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s - %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')
        self.logger = logging.getLogger(__name__)
        self.logger.info("Starting Document Watcher...")

    def on_created(self, event):
        """Process the created path if it is a ``.pdf`` file; ignore directories."""
        if event.is_directory:
            return None

        if event.src_path.endswith('.pdf'):
            self.logger.info(f"New PDF file detected: {event.src_path}")
            self.process_pdf(event.src_path)

    @staticmethod
    def _first_non_empty_line(text):
        """Return the first non-blank line of *text*, stripped, or ``""``."""
        for line in text.split('\n'):
            stripped = line.strip()
            if stripped:
                return stripped
        return ""

    def _ocr_page(self, page, zoom_matrix):
        """Render *page* with *zoom_matrix* and return its stripped OCR text."""
        pix = page.get_pixmap(matrix=zoom_matrix)
        # Hand the rendered image to PIL in memory; no temporary file needed.
        img = Image.open(BytesIO(pix.tobytes()))
        return pytesseract.image_to_string(img).strip()

    def process_pdf(self, file_path):
        """OCR the PDF at *file_path*, store it as a ``Document``, then delete it.

        The document type is the lower-cased first non-empty OCR line of the
        document when that line is one of the values in
        ``Document.DOCUMENT_TYPE_CHOICES``; otherwise it is ``"other"``.
        Any exception is logged (with traceback) and not re-raised, so one bad
        file does not stop the watcher; in that case the file is left in place.
        """
        try:
            filename = os.path.basename(file_path)
            # DOCUMENT_TYPE_CHOICES holds (value, label) pairs, so membership
            # must be tested against the values, not the pairs themselves.
            valid_types = {value for value, _label in Document.DOCUMENT_TYPE_CHOICES}
            # get_pixmap needs a real Matrix; a bare 2-tuple is not matrix-like.
            zoom_matrix = fitz.Matrix(self.OCR_ZOOM, self.OCR_ZOOM)

            page_texts = []
            heading = ""
            with fitz.open(file_path) as doc:
                num_pages = len(doc)
                for page in doc:
                    text = self._ocr_page(page, zoom_matrix)
                    if not heading:
                        # Classify from the first text found in the document,
                        # so later pages cannot override the heading.
                        heading = self._first_non_empty_line(text).lower()
                    page_texts.append(text)

            document_type = (
                heading if heading in valid_types else self.FALLBACK_DOCUMENT_TYPE
            )
            # Page texts are concatenated without a separator, as before.
            metadata = "".join(page_texts)

            document, created = Document.objects.get_or_create(
                name=filename,
                defaults={
                    'number_pages': num_pages,
                    'ocr_metadata': metadata,
                    'document_type': document_type
                }
            )

            if created:
                self.logger.info(
                    f"Document '{filename}' created successfully with type '{document_type}'.")
            else:
                self.logger.info(f"Document '{filename}' already exists.")

            os.remove(file_path)
        except Exception as e:
            # Top-level boundary for a single file: log with traceback, keep watching.
            self.logger.exception(f"Error processing PDF: {str(e)}")
class PDFWatcher:
    """Owns a watchdog ``Observer`` that feeds new uploads to ``PDFHandler``."""

    def __init__(self):
        """Create the (not yet started) observer."""
        self.observer = Observer()

    def run(self):
        """Watch ``<MEDIA_ROOT>/uploads`` recursively until interrupted.

        Blocks the calling thread. A ``KeyboardInterrupt`` (Ctrl+C) ends the
        loop quietly; whatever ends the loop, the observer is stopped and
        joined before this method returns or propagates.
        """
        event_handler = PDFHandler()
        watch_directory = os.path.join(MEDIA_ROOT, "uploads")

        self.observer.schedule(
            event_handler, watch_directory, recursive=True)
        self.observer.start()

        try:
            while True:
                time.sleep(5)
        except KeyboardInterrupt:
            # Ctrl+C is the expected way to stop the watcher.
            pass
        finally:
            # Always shut the observer thread down, whatever ended the loop.
            self.observer.stop()
            self.observer.join()
class Command(BaseCommand):
    """Management command that starts the PDF file watcher."""

    help = "Runs a dedicated file watcher service"

    def handle(self, *args, **options):
        """Build a ``PDFWatcher`` and run it."""
        PDFWatcher().run()

View file

@ -89,7 +89,6 @@ INSTALLED_APPS = [
"corsheaders", "corsheaders",
"drf_spectacular", "drf_spectacular",
"drf_spectacular_sidecar", "drf_spectacular_sidecar",
"config",
"emails", "emails",
"accounts", "accounts",
"documents", "documents",
@ -267,4 +266,4 @@ GRAPH_MODELS = {"app_labels": [
"accounts", "documents", "document_requests", "questionnaires"]} "accounts", "documents", "document_requests", "questionnaires"]}
CORS_ORIGIN_ALLOW_ALL = True CORS_ORIGIN_ALLOW_ALL = True
CORS_ALLOW_CREDENTIALS = True CORS_ALLOW_CREDENTIALS = True

View file

@ -1,26 +0,0 @@
# Generated by Django 5.1.3 on 2024-11-26 15:12
from django.db import migrations, models
class Migration(migrations.Migration):
    """Alter ``document.document_type`` to a 32-character field with four choices."""

    # Must be applied after the migration that introduced ``ocr_metadata``.
    dependencies = [("documents", "0003_remove_document_metadata_document_ocr_metadata")]

    operations = [
        migrations.AlterField(
            model_name="document",
            name="document_type",
            field=models.CharField(
                max_length=32,
                choices=[
                    ("memorandum", "Memorandum"),
                    ("hoa", "HOA"),
                    ("documented procedures manual", "Documented Procedures Manual"),
                    ("other", "Other"),
                ],
            ),
        ),
    ]

View file

@ -10,9 +10,7 @@ class Document(models.Model):
DOCUMENT_TYPE_CHOICES = ( DOCUMENT_TYPE_CHOICES = (
("memorandum", "Memorandum"), ("memorandum", "Memorandum"),
("hoa", "HOA"), ("hoa", "HOA"),
("documented procedures manual", "Documented Procedures Manual"), # TODO: Update this list on types of documents
("other", "Other"),
) )
document_type = models.CharField( document_type = models.CharField(
max_length=32, choices=DOCUMENT_TYPE_CHOICES, null=False, blank=False max_length=32, choices=DOCUMENT_TYPE_CHOICES, null=False, blank=False

View file

@ -1,6 +1,3 @@
from io import BytesIO
from documents.models import Document
from django.db.models.signals import post_save from django.db.models.signals import post_save
from django.dispatch import receiver from django.dispatch import receiver
from config.settings import MEDIA_ROOT from config.settings import MEDIA_ROOT
@ -12,25 +9,18 @@ from .models import Document
@receiver(post_save, sender=Document) @receiver(post_save, sender=Document)
def document_post_save(sender, instance, **kwargs): def domain_post_save(sender, instance, **kwargs):
if not instance.ocr_metadata: if not instance.ocr_metadata:
metadata = "" metadata = ""
with fitz.open(os.path.join(MEDIA_ROOT, instance.file.name)) as doc: with fitz.open(os.path.join(MEDIA_ROOT, instance.file.name)) as doc:
mat = fitz.Matrix(1.2, 1.2) mat = fitz.Matrix(1.2, 1.2)
for page in doc: for page in doc:
pix = page.get_pixmap(matrix=mat) pix = page.get_pixmap(matrix=mat)
# Convert pixmap to bytes output = f'{page.number}.jpg'
img_bytes = pix.tobytes() pix.save(output)
res = str(pytesseract.image_to_string(Image.open(output)))
# Create a BytesIO object os.remove(output)
img_buffer = BytesIO(img_bytes) metadata += res
# Create a PIL Image object from the bytes
img = Image.open(img_buffer)
# Perform OCR
text = pytesseract.image_to_string(img).strip()
metadata += text
instance.ocr_metadata = metadata instance.ocr_metadata = metadata
instance.save() instance.save()

View file

@ -57,6 +57,5 @@ typing_extensions==4.12.2
tzdata==2024.2 tzdata==2024.2
uritemplate==4.1.1 uritemplate==4.1.1
urllib3==2.2.3 urllib3==2.2.3
watchdog==6.0.0
whitenoise==6.8.2 whitenoise==6.8.2
pygraphviz==1.14; platform_system == 'Linux' pygraphviz==1.14; platform_system == 'Linux'

View file

@ -8,8 +8,7 @@ if [ ! -d "static" ]; then
echo "Generating static files" echo "Generating static files"
python manage.py collectstatic --noinput python manage.py collectstatic --noinput
fi fi
tmux new-session -d -s "API File Watcher" "cd /app/docmanager_backend && python manage.py start_watcher" if [ "$DEBUG" = 'True' ]; then
if [ "$DEBUG" = 'True' ]; then
python manage.py runserver "0.0.0.0:8000" python manage.py runserver "0.0.0.0:8000"
else else
gunicorn --workers 8 --bind 0.0.0.0:8000 config.wsgi:application gunicorn --workers 8 --bind 0.0.0.0:8000 config.wsgi:application