From f39f5966d6d0b771cca59eb31454148bdbb70b48 Mon Sep 17 00:00:00 2001 From: Keannu Bernasol Date: Sat, 7 Dec 2024 02:44:45 +0800 Subject: [PATCH 1/3] Add Ollama integration --- .env.sample | 7 +- .../management/commands/start_watcher.py | 89 +++++++++++++++++-- 2 files changed, 87 insertions(+), 9 deletions(-) diff --git a/.env.sample b/.env.sample index 362394d..69f9163 100644 --- a/.env.sample +++ b/.env.sample @@ -20,4 +20,9 @@ ADMIN_EMAIL = 'admin@test.com' ADMIN_PASSWORD = '' # To insert test data or not (UNUSED) -TEST_DATA = "True" \ No newline at end of file +TEST_DATA = "True" + +# Ollama for Categorization +OLLAMA_URL = "" +OLLAMA_USERNAME = "" +OLLAMA_PASSWORD = "" \ No newline at end of file diff --git a/docmanager_backend/config/management/commands/start_watcher.py b/docmanager_backend/config/management/commands/start_watcher.py index 656afcb..0eb0998 100644 --- a/docmanager_backend/config/management/commands/start_watcher.py +++ b/docmanager_backend/config/management/commands/start_watcher.py @@ -1,3 +1,6 @@ +from ollama import ChatResponse +import base64 +import httpx from django.core.management.base import BaseCommand, CommandError from io import BytesIO @@ -10,9 +13,11 @@ from config.settings import MEDIA_ROOT from watchdog.observers import Observer from watchdog.events import FileSystemEventHandler from documents.models import Document +from config.settings import get_secret from django.core.files import File import logging import time +from ollama import Client class PDFHandler(FileSystemEventHandler): @@ -36,8 +41,26 @@ class PDFHandler(FileSystemEventHandler): def process_pdf(self, file_path): try: - filename = os.path.basename(file_path) - filename = str(filename).replace(" ", "") + # Get the original filename and directory + original_filename = os.path.basename(file_path) + original_dir = os.path.dirname(file_path) + + # Check if the filename contains spaces + if " " in original_filename: + # Create the new filename by replacing spaces + new_filename = original_filename.replace(" ", "_") + + # Construct the new full file path + new_file_path = os.path.join(original_dir, new_filename) + + # Rename the file + os.rename(file_path, new_file_path) + + # Update the filename and file_path variables + filename = new_filename + file_path = new_file_path + else: + filename = original_filename metadata = "" document_type = "" @@ -60,14 +83,64 @@ class PDFHandler(FileSystemEventHandler): # Perform OCR text = pytesseract.image_to_string(img).strip() - lines = text.split("\n") + # Get document category + # Try to pass image to the Ollama image recognition API first + try: + client = Client( + host=get_secret("OLLAMA_URL"), + auth=httpx.BasicAuth( + username=get_secret("OLLAMA_USERNAME"), password=get_secret("OLLAMA_PASSWORD")) + ) - for line in lines: - if line.strip(): - document_type = line.strip().lower() + encoded_image = base64.b64encode( + img_buffer.getvalue()).decode() + + attempts = 0 + while True: + if attempts >= 3: + raise Exception( + "Unable to categorize using Ollama API") + attempts += 1 + + content = f""" + Read the text from the image and provide a category. + + Possible categories are: Announcement, Manual, Form + + Respond only with the category. No explanations are necessary. + """ + + response: ChatResponse = client.chat( + model="llama3.2-vision", + messages=[ + {"role": "user", "content": content, + "images": [encoded_image]}, + ], + ) + + document_type = response["message"]["content"].split(":")[ + 0].replace("*", "").replace(".", "") + + # A few safety checks if the model does not follow through with output instructions + if len(document_type) > 16: + self.logger.warning( + f"Ollama API gave incorrect document category: {response["message"]["content"]}. Retrying...") break - if not document_type: - document_type = "other" + + # If that fails, just use regular OCR read the title as a dirty fix/fallback + except Exception as e: + self.logger.warning(f"Error! {e}") + self.logger.warning( + "Ollama OCR offloading failed. Falling back to default OCR") + lines = text.split("\n") + + for line in lines: + if line.strip(): + document_type = line.strip().lower() + break + + if not document_type: + document_type = "other" metadata += text From d81319c8ec303bbc1a2c51e702a5f90cbc4c387a Mon Sep 17 00:00:00 2001 From: Keannu Bernasol Date: Sat, 7 Dec 2024 14:03:17 +0800 Subject: [PATCH 2/3] Update Ollama API to use different model --- .env.sample | 4 +++- .../config/management/commands/start_watcher.py | 10 +++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.env.sample b/.env.sample index 69f9163..72470fd 100644 --- a/.env.sample +++ b/.env.sample @@ -23,6 +23,8 @@ ADMIN_PASSWORD = '' TEST_DATA = "True" # Ollama for Categorization -OLLAMA_URL = "" +OLLAMA_URL = "localhost:11434" +OLLAMA_USE_AUTH = "False" +OLLAMA_MODEL = "knoopx/mobile-vlm:3b-fp16" OLLAMA_USERNAME = "" OLLAMA_PASSWORD = "" \ No newline at end of file diff --git a/docmanager_backend/config/management/commands/start_watcher.py b/docmanager_backend/config/management/commands/start_watcher.py index 0eb0998..0bc3e79 100644 --- a/docmanager_backend/config/management/commands/start_watcher.py +++ b/docmanager_backend/config/management/commands/start_watcher.py @@ -1,7 +1,7 @@ from ollama import ChatResponse import base64 import httpx -from django.core.management.base import BaseCommand, CommandError +from django.core.management.base import BaseCommand from io import BytesIO from documents.models import Document @@ -89,7 +89,7 @@ class PDFHandler(FileSystemEventHandler): client = Client( host=get_secret("OLLAMA_URL"), auth=httpx.BasicAuth( - username=get_secret("OLLAMA_USERNAME"), password=get_secret("OLLAMA_PASSWORD")) + username=get_secret("OLLAMA_USERNAME"), password=get_secret("OLLAMA_PASSWORD")) if get_secret("OLLAMA_USE_AUTH") else None ) encoded_image = base64.b64encode( @@ -111,15 +111,15 @@ class PDFHandler(FileSystemEventHandler): """ response: ChatResponse = client.chat( - model="llama3.2-vision", + model=get_secret("OLLAMA_MODEL"), messages=[ {"role": "user", "content": content, "images": [encoded_image]}, ], ) - document_type = response["message"]["content"].split(":")[ - 0].replace("*", "").replace(".", "") + document_type = response["message"]["content"].replace( + "*", "").replace(".", "") # A few safety checks if the model does not follow through with output instructions if len(document_type) > 16: From 844113d44f9f43dd6e5a2b891ca1dc6dde982836 Mon Sep 17 00:00:00 2001 From: Keannu Bernasol Date: Mon, 16 Dec 2024 14:58:50 +0800 Subject: [PATCH 3/3] Add in requested changes --- docmanager_backend/accounts/permissions.py | 5 +- docmanager_backend/accounts/serializers.py | 2 +- docmanager_backend/accounts/signals.py | 57 ++++++++++++++----- .../management/commands/start_watcher.py | 7 +-- docmanager_backend/documents/admin.py | 2 +- docmanager_backend/documents/views.py | 2 +- .../emails/templates/request_approved.html | 4 +- 7 files changed, 54 insertions(+), 25 deletions(-) diff --git a/docmanager_backend/accounts/permissions.py b/docmanager_backend/accounts/permissions.py index 5e45e97..e2be857 100644 --- a/docmanager_backend/accounts/permissions.py +++ b/docmanager_backend/accounts/permissions.py @@ -8,7 +8,8 @@ class IsStaff(BasePermission): def has_permission(self, request, view): return bool( - request.user and request.user.role in ("head", "admin", "planning", "staff") + request.user and request.user.role in ( + "head", "admin", "planning", "staff") ) @@ -18,7 +19,7 @@ class IsPlanning(BasePermission): """ def has_permission(self, request, view): - return bool(request.user and request.user.role in ("head", "admin", "planning")) + return bool(request.user and request.user.role == "planning") class IsHead(BasePermission): diff --git a/docmanager_backend/accounts/serializers.py b/docmanager_backend/accounts/serializers.py index 32ede3c..80384a0 100644 --- a/docmanager_backend/accounts/serializers.py +++ b/docmanager_backend/accounts/serializers.py @@ -6,7 +6,7 @@ from rest_framework.settings import api_settings class CustomUserSerializer(serializers.ModelSerializer): - birthday = serializers.DateField(format="%m-%d-%Y") + birthday = serializers.DateField(format="%Y-%m-%d") class Meta: model = CustomUser diff --git a/docmanager_backend/accounts/signals.py b/docmanager_backend/accounts/signals.py index bb53c71..6baddc5 100644 --- a/docmanager_backend/accounts/signals.py +++ b/docmanager_backend/accounts/signals.py @@ -10,19 +10,48 @@ from .models import CustomUser def create_admin_user(sender, **kwargs): # Programatically creates the administrator account if sender.name == "accounts": - ADMIN_USER = CustomUser.objects.filter( - email=get_secret("ADMIN_EMAIL")).first() - if not ADMIN_USER: - ADMIN_USER = CustomUser.objects.create_superuser( - username=get_secret("ADMIN_EMAIL"), - email=get_secret("ADMIN_EMAIL"), - password=get_secret("ADMIN_PASSWORD"), - sex="male", - birthday=localdate(now()), - ) + users = [{ + "email": get_secret("ADMIN_EMAIL"), + "role": "head", + "admin": True, + }, { + "email": "staff@test.com", + "role": "staff", + "admin": False, + }, { + "email": "planning@test.com", + "role": "planning", + "admin": False, + }, { + "email": "client@test.com", + "role": "client", + "admin": False, + },] + for user in users: + USER = CustomUser.objects.filter( + email=user["email"]).first() + if not USER: + if user["admin"]: + USER = CustomUser.objects.create_superuser( + username=user["email"], + email=user["email"], + password=get_secret("ADMIN_PASSWORD"), + sex="male", + birthday=localdate(now()), + role=user["role"] + ) + else: + USER = CustomUser.objects.create_user( + username=user["email"], + email=user["email"], + password=get_secret("ADMIN_PASSWORD"), + sex="male", + birthday=localdate(now()), + role=user["role"] - print("Created administrator account:", ADMIN_USER.email) + ) + print(f"Created {user['role']} account: {USER.email}") - ADMIN_USER.first_name = "Administrator" - ADMIN_USER.is_active = True - ADMIN_USER.save() + USER.first_name = f"DEBUG_USER:{USER.email}" + USER.is_active = True + USER.save() diff --git a/docmanager_backend/config/management/commands/start_watcher.py b/docmanager_backend/config/management/commands/start_watcher.py index 0bc3e79..2fe93fe 100644 --- a/docmanager_backend/config/management/commands/start_watcher.py +++ b/docmanager_backend/config/management/commands/start_watcher.py @@ -124,7 +124,7 @@ class PDFHandler(FileSystemEventHandler): # A few safety checks if the model does not follow through with output instructions if len(document_type) > 16: self.logger.warning( - f"Ollama API gave incorrect document category: {response["message"]["content"]}. Retrying...") + f"Ollama API gave incorrect document category: {response['message']['content']}. Retrying...") break # If that fails, just use regular OCR read the title as a dirty fix/fallback @@ -146,7 +146,7 @@ class PDFHandler(FileSystemEventHandler): # Open the file for instance creation DOCUMENT, created = Document.objects.get_or_create( - name=filename, + name=filename.replace(".pdf", ""), defaults={ "number_pages": num_pages, "ocr_metadata": metadata, @@ -158,8 +158,7 @@ class PDFHandler(FileSystemEventHandler): DOCUMENT.file.save( name=filename, content=File(open(file_path, "rb"))) self.logger.info( - f"Document '{filename}' created successfully with type '{ - document_type}'." + f"Document '{filename}' created successfully with type '{document_type}'." ) else: diff --git a/docmanager_backend/documents/admin.py b/docmanager_backend/documents/admin.py index 4e56f79..356fe3e 100644 --- a/docmanager_backend/documents/admin.py +++ b/docmanager_backend/documents/admin.py @@ -8,4 +8,4 @@ from .models import Document class DocumentAdmin(ModelAdmin): model = Document search_fields = ["id", "name", "document_type"] - list_display = ["id", "name", "document_type"] + list_display = ["id", "name", "document_type", "date_uploaded"] diff --git a/docmanager_backend/documents/views.py b/docmanager_backend/documents/views.py index 0bcb527..9b7d1f1 100644 --- a/docmanager_backend/documents/views.py +++ b/docmanager_backend/documents/views.py @@ -51,7 +51,7 @@ class DocumentListView(generics.ListAPIView): http_method_names = ["get"] serializer_class = DocumentSerializer - queryset = Document.objects.all() + queryset = Document.objects.all().order_by("-date_uploaded") pagination_class = PageNumberPagination permission_classes = [IsAuthenticated] diff --git a/docmanager_backend/emails/templates/request_approved.html b/docmanager_backend/emails/templates/request_approved.html index 783c865..2b74432 100644 --- a/docmanager_backend/emails/templates/request_approved.html +++ b/docmanager_backend/emails/templates/request_approved.html @@ -10,7 +10,7 @@ {% trans 'Please visit the site to check your request:' %} {{ url|safe }} - {% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies:' %} + {% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies. Hardcopy requests are valid only within 1 month of requesting.' %} {% endblock %} {% block html_body %} @@ -27,6 +27,6 @@

- {% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies:' %} + {% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies. Hardcopy requests are valid only within 1 month of requesting.' %}

{% endblock %}