mirror of
https://github.com/lemeow125/DocManagerBackend.git
synced 2025-04-20 08:51:31 +08:00
Compare commits
No commits in common. "463e33d219fd76b685d481da363f0651f954fcc4" and "957272cd07cc32eea6f5cd94c314a6f246469b7f" have entirely different histories.
463e33d219
...
957272cd07
10 changed files with 10 additions and 170 deletions
|
@ -10,7 +10,7 @@ COPY scripts/ /app/scripts/
|
||||||
RUN chmod +x /app/scripts/start.sh
|
RUN chmod +x /app/scripts/start.sh
|
||||||
|
|
||||||
# Install packages
|
# Install packages
|
||||||
RUN apt update && apt install -y graphviz libgraphviz-dev graphviz-dev tesseract-ocr tmux
|
RUN apt update && apt install -y graphviz libgraphviz-dev graphviz-dev tesseract-ocr
|
||||||
RUN pip3 install --upgrade pip && pip3 install --no-cache-dir -r requirements.txt
|
RUN pip3 install --upgrade pip && pip3 install --no-cache-dir -r requirements.txt
|
||||||
|
|
||||||
# Expose port 8000 for the web server
|
# Expose port 8000 for the web server
|
||||||
|
|
|
@ -10,8 +10,6 @@ services:
|
||||||
environment:
|
environment:
|
||||||
- PYTHONBUFFERED=1
|
- PYTHONBUFFERED=1
|
||||||
volumes:
|
volumes:
|
||||||
# File Watcher is broken in Windows Docker since Docker does not notify container about any file changes you make from Windows
|
|
||||||
# If running on Windows, use a shared volume instead of bind mount
|
|
||||||
- .:/app
|
- .:/app
|
||||||
|
|
||||||
# SMTP Server
|
# SMTP Server
|
||||||
|
|
|
@ -1,117 +0,0 @@
|
||||||
from django.core.management.base import BaseCommand, CommandError
|
|
||||||
|
|
||||||
from io import BytesIO
|
|
||||||
from documents.models import Document
|
|
||||||
from PIL import Image
|
|
||||||
import pytesseract
|
|
||||||
import fitz
|
|
||||||
import os
|
|
||||||
from config.settings import MEDIA_ROOT
|
|
||||||
from watchdog.observers import Observer
|
|
||||||
from watchdog.events import FileSystemEventHandler
|
|
||||||
from documents.models import Document
|
|
||||||
import logging
|
|
||||||
import time
|
|
||||||
|
|
||||||
|
|
||||||
class PDFHandler(FileSystemEventHandler):
    """Watchdog event handler that OCRs newly created PDFs into ``Document`` rows.

    For every ``.pdf`` file created under the watched directory, each page is
    rasterized with PyMuPDF, run through Tesseract, and the combined text is
    stored as the document's OCR metadata. The source file is removed once
    the document has been recorded.
    """

    def __init__(self):
        super().__init__()
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s - %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

        self.logger = logging.getLogger(__name__)
        self.logger.info("Starting Document Watcher...")

    def on_created(self, event):
        """Process the created path when it is a PDF file; ignore directories.

        Args:
            event: The watchdog file-system event describing the new path.
        """
        if event.is_directory:
            return None

        # Scanners frequently emit upper-case extensions (e.g. "SCAN.PDF")
        if event.src_path.lower().endswith('.pdf'):
            self.logger.info(f"New PDF file detected: {event.src_path}")
            self.process_pdf(event.src_path)

    def process_pdf(self, file_path):
        """OCR every page of ``file_path`` and record it as a ``Document``.

        The document type is taken from the first non-blank OCR line of the
        document (lower-cased) when it matches one of the values declared in
        ``Document.DOCUMENT_TYPE_CHOICES``; otherwise it falls back to
        ``"other"``. Any failure is logged and swallowed so that a single bad
        file does not stop the watcher.

        Args:
            file_path: Path of the PDF file to process.
        """
        try:
            filename = os.path.basename(file_path)

            # DOCUMENT_TYPE_CHOICES holds (value, label) pairs, so membership
            # has to be tested against the values, not the pairs themselves.
            valid_types = {choice[0] for choice in Document.DOCUMENT_TYPE_CHOICES}

            page_texts = []
            heading = ""

            with fitz.open(file_path) as doc:
                num_pages = len(doc)
                # Same 1.2x zoom the post_save OCR signal uses
                zoom = fitz.Matrix(1.2, 1.2)

                for page in doc:
                    pix = page.get_pixmap(matrix=zoom)

                    # Hand the rendered page to PIL through an in-memory buffer
                    img = Image.open(BytesIO(pix.tobytes()))

                    # Perform OCR
                    text = pytesseract.image_to_string(img).strip()
                    page_texts.append(text)

                    # Only the first non-blank line of the whole document is
                    # a type candidate; later pages must not overwrite it.
                    if not heading:
                        heading = next(
                            (line.strip().lower()
                             for line in text.split('\n') if line.strip()),
                            "",
                        )

            document_type = heading if heading in valid_types else "other"
            metadata = "".join(page_texts)

            document, created = Document.objects.get_or_create(
                name=filename,
                defaults={
                    'number_pages': num_pages,
                    'ocr_metadata': metadata,
                    'document_type': document_type
                }
            )

            if created:
                self.logger.info(
                    "Document '%s' created successfully with type '%s'.",
                    filename, document_type)
            else:
                self.logger.info(f"Document '{filename}' already exists.")

            os.remove(file_path)
        except Exception as e:
            # Keep the watcher alive, but record the traceback for diagnosis
            self.logger.exception("Error processing PDF: %s", e)
|
|
||||||
|
|
||||||
|
|
||||||
class PDFWatcher:
    """Runs a watchdog observer over the ``uploads`` directory under ``MEDIA_ROOT``."""

    def __init__(self):
        self.observer = Observer()

    def run(self):
        """Watch the uploads directory until interrupted or the observer dies.

        Blocks the calling thread. The observer is always stopped and joined
        on the way out, whether the loop ends normally, via Ctrl-C, or through
        another exception (which is then propagated instead of being hidden).
        """
        event_handler = PDFHandler()
        watch_directory = os.path.join(MEDIA_ROOT, "uploads")

        # A fresh deployment may not have the uploads directory yet, and a
        # missing directory cannot be watched.
        os.makedirs(watch_directory, exist_ok=True)

        self.observer.schedule(
            event_handler, watch_directory, recursive=True)
        self.observer.start()

        try:
            # Stop polling if the observer thread has terminated on its own
            while self.observer.is_alive():
                time.sleep(5)
        except KeyboardInterrupt:
            # Ctrl-C is the expected way to shut the watcher down
            pass
        finally:
            self.observer.stop()
            self.observer.join()
|
|
||||||
|
|
||||||
|
|
||||||
class Command(BaseCommand):
    """Django management command that launches the PDF file watcher."""

    help = "Runs a dedicated file watcher service"

    def handle(self, *args, **options):
        """Create a ``PDFWatcher`` and run it; positional and keyword options are unused."""
        PDFWatcher().run()
|
|
|
@ -89,7 +89,6 @@ INSTALLED_APPS = [
|
||||||
"corsheaders",
|
"corsheaders",
|
||||||
"drf_spectacular",
|
"drf_spectacular",
|
||||||
"drf_spectacular_sidecar",
|
"drf_spectacular_sidecar",
|
||||||
"config",
|
|
||||||
"emails",
|
"emails",
|
||||||
"accounts",
|
"accounts",
|
||||||
"documents",
|
"documents",
|
||||||
|
@ -267,4 +266,4 @@ GRAPH_MODELS = {"app_labels": [
|
||||||
"accounts", "documents", "document_requests", "questionnaires"]}
|
"accounts", "documents", "document_requests", "questionnaires"]}
|
||||||
|
|
||||||
CORS_ORIGIN_ALLOW_ALL = True
|
CORS_ORIGIN_ALLOW_ALL = True
|
||||||
CORS_ALLOW_CREDENTIALS = True
|
CORS_ALLOW_CREDENTIALS = True
|
|
@ -1,26 +0,0 @@
|
||||||
# Generated by Django 5.1.3 on 2024-11-26 15:12
|
|
||||||
|
|
||||||
from django.db import migrations, models
|
|
||||||
|
|
||||||
|
|
||||||
class Migration(migrations.Migration):
    """Alter ``Document.document_type`` to use the four listed type choices."""

    # Must run after the migration that introduced ``ocr_metadata``
    dependencies = [
        ("documents", "0003_remove_document_metadata_document_ocr_metadata"),
    ]

    operations = [
        migrations.AlterField(
            model_name="document",
            name="document_type",
            field=models.CharField(
                max_length=32,
                choices=[
                    ("memorandum", "Memorandum"),
                    ("hoa", "HOA"),
                    ("documented procedures manual", "Documented Procedures Manual"),
                    ("other", "Other"),
                ],
            ),
        ),
    ]
|
|
|
@ -10,9 +10,7 @@ class Document(models.Model):
|
||||||
DOCUMENT_TYPE_CHOICES = (
|
DOCUMENT_TYPE_CHOICES = (
|
||||||
("memorandum", "Memorandum"),
|
("memorandum", "Memorandum"),
|
||||||
("hoa", "HOA"),
|
("hoa", "HOA"),
|
||||||
("documented procedures manual", "Documented Procedures Manual"),
|
# TODO: Update this list on types of documents
|
||||||
("other", "Other"),
|
|
||||||
|
|
||||||
)
|
)
|
||||||
document_type = models.CharField(
|
document_type = models.CharField(
|
||||||
max_length=32, choices=DOCUMENT_TYPE_CHOICES, null=False, blank=False
|
max_length=32, choices=DOCUMENT_TYPE_CHOICES, null=False, blank=False
|
||||||
|
|
|
@ -1,6 +1,3 @@
|
||||||
|
|
||||||
from io import BytesIO
|
|
||||||
from documents.models import Document
|
|
||||||
from django.db.models.signals import post_save
|
from django.db.models.signals import post_save
|
||||||
from django.dispatch import receiver
|
from django.dispatch import receiver
|
||||||
from config.settings import MEDIA_ROOT
|
from config.settings import MEDIA_ROOT
|
||||||
|
@ -12,25 +9,18 @@ from .models import Document
|
||||||
|
|
||||||
|
|
||||||
@receiver(post_save, sender=Document)
|
@receiver(post_save, sender=Document)
|
||||||
def document_post_save(sender, instance, **kwargs):
|
def domain_post_save(sender, instance, **kwargs):
|
||||||
if not instance.ocr_metadata:
|
if not instance.ocr_metadata:
|
||||||
metadata = ""
|
metadata = ""
|
||||||
with fitz.open(os.path.join(MEDIA_ROOT, instance.file.name)) as doc:
|
with fitz.open(os.path.join(MEDIA_ROOT, instance.file.name)) as doc:
|
||||||
mat = fitz.Matrix(1.2, 1.2)
|
mat = fitz.Matrix(1.2, 1.2)
|
||||||
for page in doc:
|
for page in doc:
|
||||||
pix = page.get_pixmap(matrix=mat)
|
pix = page.get_pixmap(matrix=mat)
|
||||||
# Convert pixmap to bytes
|
output = f'{page.number}.jpg'
|
||||||
img_bytes = pix.tobytes()
|
pix.save(output)
|
||||||
|
res = str(pytesseract.image_to_string(Image.open(output)))
|
||||||
# Create a BytesIO object
|
os.remove(output)
|
||||||
img_buffer = BytesIO(img_bytes)
|
metadata += res
|
||||||
|
|
||||||
# Create a PIL Image object from the bytes
|
|
||||||
img = Image.open(img_buffer)
|
|
||||||
|
|
||||||
# Perform OCR
|
|
||||||
text = pytesseract.image_to_string(img).strip()
|
|
||||||
metadata += text
|
|
||||||
|
|
||||||
instance.ocr_metadata = metadata
|
instance.ocr_metadata = metadata
|
||||||
instance.save()
|
instance.save()
|
||||||
|
|
|
@ -57,6 +57,5 @@ typing_extensions==4.12.2
|
||||||
tzdata==2024.2
|
tzdata==2024.2
|
||||||
uritemplate==4.1.1
|
uritemplate==4.1.1
|
||||||
urllib3==2.2.3
|
urllib3==2.2.3
|
||||||
watchdog==6.0.0
|
|
||||||
whitenoise==6.8.2
|
whitenoise==6.8.2
|
||||||
pygraphviz==1.14; platform_system == 'Linux'
|
pygraphviz==1.14; platform_system == 'Linux'
|
|
@ -8,8 +8,7 @@ if [ ! -d "static" ]; then
|
||||||
echo "Generating static files"
|
echo "Generating static files"
|
||||||
python manage.py collectstatic --noinput
|
python manage.py collectstatic --noinput
|
||||||
fi
|
fi
|
||||||
tmux new-session -d -s "API File Watcher" "cd /app/docmanager_backend && python manage.py start_watcher"
|
if [ "$DEBUG" = 'True' ]; then
|
||||||
if [ "$DEBUG" = 'True' ]; then
|
|
||||||
python manage.py runserver "0.0.0.0:8000"
|
python manage.py runserver "0.0.0.0:8000"
|
||||||
else
|
else
|
||||||
gunicorn --workers 8 --bind 0.0.0.0:8000 config.wsgi:application
|
gunicorn --workers 8 --bind 0.0.0.0:8000 config.wsgi:application
|
||||||
|
|
Loading…
Add table
Reference in a new issue