diff --git a/.env.sample b/.env.sample index 362394d..72470fd 100644 --- a/.env.sample +++ b/.env.sample @@ -20,4 +20,11 @@ ADMIN_EMAIL = 'admin@test.com' ADMIN_PASSWORD = '' # To insert test data or not (UNUSED) -TEST_DATA = "True" \ No newline at end of file +TEST_DATA = "True" + +# Ollama for Categorization +OLLAMA_URL = "localhost:11434" +OLLAMA_USE_AUTH = "False" +OLLAMA_MODEL = "knoopx/mobile-vlm:3b-fp16" +OLLAMA_USERNAME = "" +OLLAMA_PASSWORD = "" \ No newline at end of file diff --git a/docmanager_backend/accounts/permissions.py b/docmanager_backend/accounts/permissions.py index 5e45e97..e2be857 100644 --- a/docmanager_backend/accounts/permissions.py +++ b/docmanager_backend/accounts/permissions.py @@ -8,7 +8,8 @@ class IsStaff(BasePermission): def has_permission(self, request, view): return bool( - request.user and request.user.role in ("head", "admin", "planning", "staff") + request.user and request.user.role in ( + "head", "admin", "planning", "staff") ) @@ -18,7 +19,7 @@ class IsPlanning(BasePermission): """ def has_permission(self, request, view): - return bool(request.user and request.user.role in ("head", "admin", "planning")) + return bool(request.user and request.user.role == "planning") class IsHead(BasePermission): diff --git a/docmanager_backend/accounts/serializers.py b/docmanager_backend/accounts/serializers.py index 32ede3c..80384a0 100644 --- a/docmanager_backend/accounts/serializers.py +++ b/docmanager_backend/accounts/serializers.py @@ -6,7 +6,7 @@ from rest_framework.settings import api_settings class CustomUserSerializer(serializers.ModelSerializer): - birthday = serializers.DateField(format="%m-%d-%Y") + birthday = serializers.DateField(format="%Y-%m-%d") class Meta: model = CustomUser diff --git a/docmanager_backend/accounts/signals.py b/docmanager_backend/accounts/signals.py index bb53c71..6baddc5 100644 --- a/docmanager_backend/accounts/signals.py +++ b/docmanager_backend/accounts/signals.py @@ -10,19 +10,48 @@ from .models import CustomUser def create_admin_user(sender, **kwargs): # Programatically creates the administrator account if sender.name == "accounts": - ADMIN_USER = CustomUser.objects.filter( - email=get_secret("ADMIN_EMAIL")).first() - if not ADMIN_USER: - ADMIN_USER = CustomUser.objects.create_superuser( - username=get_secret("ADMIN_EMAIL"), - email=get_secret("ADMIN_EMAIL"), - password=get_secret("ADMIN_PASSWORD"), - sex="male", - birthday=localdate(now()), - ) + users = [{ + "email": get_secret("ADMIN_EMAIL"), + "role": "head", + "admin": True, + }, { + "email": "staff@test.com", + "role": "staff", + "admin": False, + }, { + "email": "planning@test.com", + "role": "planning", + "admin": False, + }, { + "email": "client@test.com", + "role": "client", + "admin": False, + },] + for user in users: + USER = CustomUser.objects.filter( + email=user["email"]).first() + if not USER: + if user["admin"]: + USER = CustomUser.objects.create_superuser( + username=user["email"], + email=user["email"], + password=get_secret("ADMIN_PASSWORD"), + sex="male", + birthday=localdate(now()), + role=user["role"] + ) + else: + USER = CustomUser.objects.create_user( + username=user["email"], + email=user["email"], + password=get_secret("ADMIN_PASSWORD"), + sex="male", + birthday=localdate(now()), + role=user["role"] - print("Created administrator account:", ADMIN_USER.email) + ) + print(f"Created {user['role']} account: {USER.email}") - ADMIN_USER.first_name = "Administrator" - ADMIN_USER.is_active = True - ADMIN_USER.save() + USER.first_name = f"DEBUG_USER:{USER.email}" + USER.is_active = True + USER.save() diff --git a/docmanager_backend/config/management/commands/start_watcher.py b/docmanager_backend/config/management/commands/start_watcher.py index 656afcb..2fe93fe 100644 --- a/docmanager_backend/config/management/commands/start_watcher.py +++ b/docmanager_backend/config/management/commands/start_watcher.py @@ -1,4 +1,7 @@ -from django.core.management.base import BaseCommand, CommandError +from ollama import ChatResponse +import base64 +import httpx +from django.core.management.base import BaseCommand from io import BytesIO from documents.models import Document @@ -10,9 +13,11 @@ from config.settings import MEDIA_ROOT from watchdog.observers import Observer from watchdog.events import FileSystemEventHandler from documents.models import Document +from config.settings import get_secret from django.core.files import File import logging import time +from ollama import Client class PDFHandler(FileSystemEventHandler): @@ -36,8 +41,26 @@ class PDFHandler(FileSystemEventHandler): def process_pdf(self, file_path): try: - filename = os.path.basename(file_path) - filename = str(filename).replace(" ", "") + # Get the original filename and directory + original_filename = os.path.basename(file_path) + original_dir = os.path.dirname(file_path) + + # Check if the filename contains spaces + if " " in original_filename: + # Create the new filename by replacing spaces + new_filename = original_filename.replace(" ", "_") + + # Construct the new full file path + new_file_path = os.path.join(original_dir, new_filename) + + # Rename the file + os.rename(file_path, new_file_path) + + # Update the filename and file_path variables + filename = new_filename + file_path = new_file_path + else: + filename = original_filename metadata = "" document_type = "" @@ -60,20 +83,70 @@ class PDFHandler(FileSystemEventHandler): # Perform OCR text = pytesseract.image_to_string(img).strip() - lines = text.split("\n") + # Get document category + # Try to pass image to the Ollama image recognition API first + try: + client = Client( + host=get_secret("OLLAMA_URL"), + auth=httpx.BasicAuth( + username=get_secret("OLLAMA_USERNAME"), password=get_secret("OLLAMA_PASSWORD")) if get_secret("OLLAMA_USE_AUTH") else None + ) - for line in lines: - if line.strip(): - document_type = line.strip().lower() + encoded_image = base64.b64encode( + img_buffer.getvalue()).decode() + + attempts = 0 + while True: + if attempts >= 3: + raise Exception( + "Unable to categorize using Ollama API") + attempts += 1 + + content = f""" + Read the text from the image and provide a category. + + Possible categories are: Announcement, Manual, Form + + Respond only with the category. No explanations are necessary. + """ + + response: ChatResponse = client.chat( + model=get_secret("OLLAMA_MODEL"), + messages=[ + {"role": "user", "content": content, + "images": [encoded_image]}, + ], + ) + + document_type = response["message"]["content"].replace( + "*", "").replace(".", "") + + # A few safety checks if the model does not follow through with output instructions + if len(document_type) > 16: + self.logger.warning( + f"Ollama API gave incorrect document category: {response['message']['content']}. Retrying...") break - if not document_type: - document_type = "other" + + # If that fails, just use regular OCR read the title as a dirty fix/fallback + except Exception as e: + self.logger.warning(f"Error! {e}") + self.logger.warning( + "Ollama OCR offloading failed. Falling back to default OCR") + lines = text.split("\n") + + for line in lines: + if line.strip(): + document_type = line.strip().lower() + break + + if not document_type: + document_type = "other" metadata += text # Open the file for instance creation DOCUMENT, created = Document.objects.get_or_create( - name=filename, + name=filename.replace(".pdf", ""), defaults={ "number_pages": num_pages, "ocr_metadata": metadata, @@ -85,8 +158,7 @@ class PDFHandler(FileSystemEventHandler): DOCUMENT.file.save( name=filename, content=File(open(file_path, "rb"))) self.logger.info( - f"Document '{filename}' created successfully with type '{ - document_type}'." + f"Document '{filename}' created successfully with type '{document_type}'." ) else: diff --git a/docmanager_backend/documents/admin.py b/docmanager_backend/documents/admin.py index 4e56f79..356fe3e 100644 --- a/docmanager_backend/documents/admin.py +++ b/docmanager_backend/documents/admin.py @@ -8,4 +8,4 @@ from .models import Document class DocumentAdmin(ModelAdmin): model = Document search_fields = ["id", "name", "document_type"] - list_display = ["id", "name", "document_type"] + list_display = ["id", "name", "document_type", "date_uploaded"] diff --git a/docmanager_backend/documents/views.py b/docmanager_backend/documents/views.py index 0bcb527..9b7d1f1 100644 --- a/docmanager_backend/documents/views.py +++ b/docmanager_backend/documents/views.py @@ -51,7 +51,7 @@ class DocumentListView(generics.ListAPIView): http_method_names = ["get"] serializer_class = DocumentSerializer - queryset = Document.objects.all() + queryset = Document.objects.all().order_by("-date_uploaded") pagination_class = PageNumberPagination permission_classes = [IsAuthenticated] diff --git a/docmanager_backend/emails/templates/request_approved.html b/docmanager_backend/emails/templates/request_approved.html index 783c865..2b74432 100644 --- a/docmanager_backend/emails/templates/request_approved.html +++ b/docmanager_backend/emails/templates/request_approved.html @@ -10,7 +10,7 @@ {% trans 'Please visit the site to check your request:' %} {{ url|safe }} - {% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies:' %} + {% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies. Hardcopy requests are valid only within 1 month of requesting.' %} {% endblock %} {% block html_body %} @@ -27,6 +27,6 @@

- {% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies:' %} + {% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies. Hardcopy requests are valid only within 1 month of requesting.' %}

{% endblock %}