diff --git a/.env.sample b/.env.sample index 362394d..69f9163 100644 --- a/.env.sample +++ b/.env.sample @@ -20,4 +20,9 @@ ADMIN_EMAIL = 'admin@test.com' ADMIN_PASSWORD = '' # To insert test data or not (UNUSED) -TEST_DATA = "True" \ No newline at end of file +TEST_DATA = "True" + +# Ollama for Categorization +OLLAMA_URL = "" +OLLAMA_USERNAME = "" +OLLAMA_PASSWORD = "" \ No newline at end of file diff --git a/docmanager_backend/config/management/commands/start_watcher.py b/docmanager_backend/config/management/commands/start_watcher.py index 656afcb..0eb0998 100644 --- a/docmanager_backend/config/management/commands/start_watcher.py +++ b/docmanager_backend/config/management/commands/start_watcher.py @@ -1,3 +1,6 @@ +from ollama import ChatResponse +import base64 +import httpx from django.core.management.base import BaseCommand, CommandError from io import BytesIO @@ -10,9 +13,11 @@ from config.settings import MEDIA_ROOT from watchdog.observers import Observer from watchdog.events import FileSystemEventHandler from documents.models import Document +from config.settings import get_secret from django.core.files import File import logging import time +from ollama import Client class PDFHandler(FileSystemEventHandler): @@ -36,8 +41,26 @@ class PDFHandler(FileSystemEventHandler): def process_pdf(self, file_path): try: - filename = os.path.basename(file_path) - filename = str(filename).replace(" ", "") + # Get the original filename and directory + original_filename = os.path.basename(file_path) + original_dir = os.path.dirname(file_path) + + # Check if the filename contains spaces + if " " in original_filename: + # Create the new filename by replacing spaces + new_filename = original_filename.replace(" ", "_") + + # Construct the new full file path + new_file_path = os.path.join(original_dir, new_filename) + + # Rename the file + os.rename(file_path, new_file_path) + + # Update the filename and file_path variables + filename = new_filename + file_path = new_file_path + else: + filename = original_filename metadata = "" document_type = "" @@ -60,14 +83,64 @@ class PDFHandler(FileSystemEventHandler): # Perform OCR text = pytesseract.image_to_string(img).strip() - lines = text.split("\n") + # Get document category + # Try to pass image to the Ollama image recognition API first + try: + client = Client( + host=get_secret("OLLAMA_URL"), + auth=httpx.BasicAuth( + username=get_secret("OLLAMA_USERNAME"), password=get_secret("OLLAMA_PASSWORD")) + ) - for line in lines: - if line.strip(): - document_type = line.strip().lower() + encoded_image = base64.b64encode( + img_buffer.getvalue()).decode() + + attempts = 0 + while True: + if attempts >= 3: + raise Exception( + "Unable to categorize using Ollama API") + attempts += 1 + + content = f""" + Read the text from the image and provide a category. + + Possible categories are: Announcement, Manual, Form + + Respond only with the category. No explanations are necessary. + """ + + response: ChatResponse = client.chat( + model="llama3.2-vision", + messages=[ + {"role": "user", "content": content, + "images": [encoded_image]}, + ], + ) + + document_type = response["message"]["content"].split(":")[ + 0].replace("*", "").replace(".", "") + + # A few safety checks if the model does not follow through with output instructions + if len(document_type) > 16: + self.logger.warning( + f"Ollama API gave incorrect document category: {response["message"]["content"]}. Retrying...") break - if not document_type: - document_type = "other" + + # If that fails, just use regular OCR read the title as a dirty fix/fallback + except Exception as e: + self.logger.warning(f"Error! {e}") + self.logger.warning( + "Ollama OCR offloading failed. Falling back to default OCR") + lines = text.split("\n") + + for line in lines: + if line.strip(): + document_type = line.strip().lower() + break + + if not document_type: + document_type = "other" metadata += text