mirror of
https://github.com/lemeow125/DocManagerBackend.git
synced 2025-01-18 17:13:00 +08:00
Add directory watcher service
This commit is contained in:
parent
957272cd07
commit
5bc170f519
6 changed files with 164 additions and 8 deletions
0
docmanager_backend/config/management/__init__.py
Normal file
0
docmanager_backend/config/management/__init__.py
Normal file
117
docmanager_backend/config/management/commands/start_watcher.py
Normal file
117
docmanager_backend/config/management/commands/start_watcher.py
Normal file
|
@ -0,0 +1,117 @@
|
|||
from django.core.management.base import BaseCommand, CommandError
|
||||
|
||||
from io import BytesIO
|
||||
from documents.models import Document
|
||||
from PIL import Image
|
||||
import pytesseract
|
||||
import fitz
|
||||
import os
|
||||
from config.settings import MEDIA_ROOT
|
||||
from watchdog.observers import Observer
|
||||
from watchdog.events import FileSystemEventHandler
|
||||
from documents.models import Document
|
||||
import logging
|
||||
import time
|
||||
|
||||
|
||||
class PDFHandler(FileSystemEventHandler):
    """Watchdog event handler that OCRs newly created PDF files.

    For every ``.pdf`` file created under the watched directory the handler
    rasterises each page, runs Tesseract OCR on it, stores the result in a
    ``Document`` row (keyed by file name) and then deletes the source file.
    """

    # Value stored when no recognised document type can be read from the OCR text.
    FALLBACK_DOCUMENT_TYPE = "other"

    def __init__(self):
        super().__init__()
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s - %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

        self.logger = logging.getLogger(__name__)
        self.logger.info("Starting Document Watcher...")

    def on_created(self, event):
        """Process the created path when it is a file ending in ``.pdf``."""
        if event.is_directory:
            return None

        if event.src_path.endswith('.pdf'):
            self.logger.info(f"New PDF file detected: {event.src_path}")
            self.process_pdf(event.src_path)

    def _detect_document_type(self, page_texts):
        """Return the document type named by the first non-empty OCR line.

        The line is lower-cased and compared against the *stored values* of
        ``Document.DOCUMENT_TYPE_CHOICES``. The choices are ``(value, label)``
        pairs, so testing the string against the tuple itself never matches.
        """
        valid_types = {value for value, _label in Document.DOCUMENT_TYPE_CHOICES}

        for text in page_texts:
            for line in text.split('\n'):
                candidate = line.strip().lower()
                if candidate:
                    # Only the first non-empty line is treated as the heading.
                    if candidate in valid_types:
                        return candidate
                    return self.FALLBACK_DOCUMENT_TYPE

        return self.FALLBACK_DOCUMENT_TYPE

    def process_pdf(self, file_path):
        """OCR ``file_path``, record it as a ``Document`` and delete the file.

        Any exception is logged and swallowed so that one bad file does not
        stop the watcher; in that case the file is left on disk.
        """
        try:
            filename = os.path.basename(file_path)
            page_texts = []

            with fitz.open(file_path) as doc:
                num_pages = len(doc)
                # Same scale matrix the documents post_save OCR uses; a bare
                # 2-tuple is not a valid matrix for get_pixmap.
                zoom = fitz.Matrix(1.2, 1.2)

                for page in doc:
                    pix = page.get_pixmap(matrix=zoom)

                    # Keep the rendered page in memory instead of a temp file.
                    img = Image.open(BytesIO(pix.tobytes()))

                    # Perform OCR
                    page_texts.append(pytesseract.image_to_string(img).strip())

            metadata = "".join(page_texts)
            document_type = self._detect_document_type(page_texts)

            document, created = Document.objects.get_or_create(
                name=filename,
                defaults={
                    'number_pages': num_pages,
                    'ocr_metadata': metadata,
                    'document_type': document_type
                }
            )

            if created:
                self.logger.info(
                    f"Document '{filename}' created successfully with type '{document_type}'.")
            else:
                self.logger.info(f"Document '{filename}' already exists.")

            os.remove(file_path)
        except Exception as e:
            # Top-level boundary of the observer callback: log with traceback
            # and keep the watcher running.
            self.logger.exception(f"Error processing PDF: {str(e)}")
|
||||
|
||||
|
||||
class PDFWatcher:
    """Runs a watchdog ``Observer`` over the uploads directory."""

    def __init__(self):
        self.observer = Observer()

    def run(self):
        """Watch ``MEDIA_ROOT/uploads`` recursively until interrupted.

        Blocks the calling thread. A ``KeyboardInterrupt`` (Ctrl+C) ends the
        loop quietly; any other exception propagates to the caller. In both
        cases the observer thread is stopped and joined first.
        """
        event_handler = PDFHandler()
        watch_directory = os.path.join(MEDIA_ROOT, "uploads")

        self.observer.schedule(
            event_handler, watch_directory, recursive=True)
        self.observer.start()

        try:
            while True:
                time.sleep(5)
        except KeyboardInterrupt:
            # Normal way to stop the command from a terminal.
            pass
        finally:
            self.observer.stop()
            self.observer.join()
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Management command that starts the PDF directory watcher."""

    help = "Watches the uploads directory and imports newly added PDF files as documents"

    def handle(self, *args, **options):
        """Create a ``PDFWatcher`` and run it; blocks until the watcher exits."""
        watcher = PDFWatcher()
        watcher.run()
|
|
@ -89,6 +89,7 @@ INSTALLED_APPS = [
|
|||
"corsheaders",
|
||||
"drf_spectacular",
|
||||
"drf_spectacular_sidecar",
|
||||
"config",
|
||||
"emails",
|
||||
"accounts",
|
||||
"documents",
|
||||
|
@ -266,4 +267,4 @@ GRAPH_MODELS = {"app_labels": [
|
|||
"accounts", "documents", "document_requests", "questionnaires"]}
|
||||
|
||||
CORS_ORIGIN_ALLOW_ALL = True
|
||||
CORS_ALLOW_CREDENTIALS = True
|
||||
CORS_ALLOW_CREDENTIALS = True
|
||||
|
|
|
@ -0,0 +1,26 @@
|
|||
# Generated by Django 5.1.3 on 2024-11-26 15:12
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Alter ``Document.document_type`` to the four-choice, 32-character field."""

    # Must run after the migration that introduced ``ocr_metadata``.
    dependencies = [
        ("documents", "0003_remove_document_metadata_document_ocr_metadata"),
    ]

    operations = [
        migrations.AlterField(
            model_name="document",
            name="document_type",
            field=models.CharField(
                max_length=32,
                # (stored value, human-readable label) pairs.
                choices=[
                    ("memorandum", "Memorandum"),
                    ("hoa", "HOA"),
                    ("documented procedures manual", "Documented Procedures Manual"),
                    ("other", "Other"),
                ],
            ),
        ),
    ]
|
|
@ -10,7 +10,9 @@ class Document(models.Model):
|
|||
DOCUMENT_TYPE_CHOICES = (
|
||||
("memorandum", "Memorandum"),
|
||||
("hoa", "HOA"),
|
||||
# TODO: Update this list on types of documents
|
||||
("documented procedures manual", "Documented Procedures Manual"),
|
||||
("other", "Other"),
|
||||
|
||||
)
|
||||
document_type = models.CharField(
|
||||
max_length=32, choices=DOCUMENT_TYPE_CHOICES, null=False, blank=False
|
||||
|
|
|
@ -1,3 +1,6 @@
|
|||
|
||||
from io import BytesIO
|
||||
from documents.models import Document
|
||||
from django.db.models.signals import post_save
|
||||
from django.dispatch import receiver
|
||||
from config.settings import MEDIA_ROOT
|
||||
|
@ -9,18 +12,25 @@ from .models import Document
|
|||
|
||||
|
||||
@receiver(post_save, sender=Document)
def document_post_save(sender, instance, **kwargs):
    """Fill in ``ocr_metadata`` for a saved ``Document`` that has none.

    Each page of the attached PDF is rendered in memory and passed through
    Tesseract; the stripped page texts are concatenated and saved back onto
    the instance.
    """
    # Nothing to do when OCR text already exists, and nothing to read when the
    # instance has no file attached.
    if instance.ocr_metadata or not instance.file:
        return

    page_texts = []
    with fitz.open(os.path.join(MEDIA_ROOT, instance.file.name)) as doc:
        mat = fitz.Matrix(1.2, 1.2)
        for page in doc:
            pix = page.get_pixmap(matrix=mat)

            # Create a PIL Image from the in-memory pixmap bytes
            img = Image.open(BytesIO(pix.tobytes()))

            # Perform OCR
            page_texts.append(pytesseract.image_to_string(img).strip())

    metadata = "".join(page_texts)
    if not metadata:
        # Saving with empty metadata would re-fire post_save with the guard
        # above still passing, recursing without end.
        return

    instance.ocr_metadata = metadata
    instance.save()
|
||||
|
|
Loading…
Reference in a new issue