mirror of
https://github.com/lemeow125/DocManagerBackend.git
synced 2025-04-20 08:51:31 +08:00
Compare commits
3 commits
9289166c0e
...
844113d44f
Author | SHA1 | Date | |
---|---|---|---|
844113d44f | |||
d81319c8ec | |||
f39f5966d6 |
8 changed files with 143 additions and 34 deletions
|
@ -21,3 +21,10 @@ ADMIN_PASSWORD = ''
|
||||||
|
|
||||||
# To insert test data or not (UNUSED)
|
# To insert test data or not (UNUSED)
|
||||||
TEST_DATA = "True"
|
TEST_DATA = "True"
|
||||||
|
|
||||||
|
# Ollama for Categorization
|
||||||
|
OLLAMA_URL = "localhost:11434"
|
||||||
|
OLLAMA_USE_AUTH = "False"
|
||||||
|
OLLAMA_MODEL = "knoopx/mobile-vlm:3b-fp16"
|
||||||
|
OLLAMA_USERNAME = ""
|
||||||
|
OLLAMA_PASSWORD = ""
|
|
@ -8,7 +8,8 @@ class IsStaff(BasePermission):
|
||||||
|
|
||||||
def has_permission(self, request, view):
|
def has_permission(self, request, view):
|
||||||
return bool(
|
return bool(
|
||||||
request.user and request.user.role in ("head", "admin", "planning", "staff")
|
request.user and request.user.role in (
|
||||||
|
"head", "admin", "planning", "staff")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -18,7 +19,7 @@ class IsPlanning(BasePermission):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def has_permission(self, request, view):
|
def has_permission(self, request, view):
|
||||||
return bool(request.user and request.user.role in ("head", "admin", "planning"))
|
return bool(request.user and request.user.role == "planning")
|
||||||
|
|
||||||
|
|
||||||
class IsHead(BasePermission):
|
class IsHead(BasePermission):
|
||||||
|
|
|
@ -6,7 +6,7 @@ from rest_framework.settings import api_settings
|
||||||
|
|
||||||
|
|
||||||
class CustomUserSerializer(serializers.ModelSerializer):
|
class CustomUserSerializer(serializers.ModelSerializer):
|
||||||
birthday = serializers.DateField(format="%m-%d-%Y")
|
birthday = serializers.DateField(format="%Y-%m-%d")
|
||||||
|
|
||||||
class Meta:
|
class Meta:
|
||||||
model = CustomUser
|
model = CustomUser
|
||||||
|
|
|
@ -10,19 +10,48 @@ from .models import CustomUser
|
||||||
def create_admin_user(sender, **kwargs):
|
def create_admin_user(sender, **kwargs):
|
||||||
# Programmatically creates the administrator account
|
# Programmatically creates the administrator account
|
||||||
if sender.name == "accounts":
|
if sender.name == "accounts":
|
||||||
ADMIN_USER = CustomUser.objects.filter(
|
users = [{
|
||||||
email=get_secret("ADMIN_EMAIL")).first()
|
"email": get_secret("ADMIN_EMAIL"),
|
||||||
if not ADMIN_USER:
|
"role": "head",
|
||||||
ADMIN_USER = CustomUser.objects.create_superuser(
|
"admin": True,
|
||||||
username=get_secret("ADMIN_EMAIL"),
|
}, {
|
||||||
email=get_secret("ADMIN_EMAIL"),
|
"email": "staff@test.com",
|
||||||
|
"role": "staff",
|
||||||
|
"admin": False,
|
||||||
|
}, {
|
||||||
|
"email": "planning@test.com",
|
||||||
|
"role": "planning",
|
||||||
|
"admin": False,
|
||||||
|
}, {
|
||||||
|
"email": "client@test.com",
|
||||||
|
"role": "client",
|
||||||
|
"admin": False,
|
||||||
|
},]
|
||||||
|
for user in users:
|
||||||
|
USER = CustomUser.objects.filter(
|
||||||
|
email=user["email"]).first()
|
||||||
|
if not USER:
|
||||||
|
if user["admin"]:
|
||||||
|
USER = CustomUser.objects.create_superuser(
|
||||||
|
username=user["email"],
|
||||||
|
email=user["email"],
|
||||||
password=get_secret("ADMIN_PASSWORD"),
|
password=get_secret("ADMIN_PASSWORD"),
|
||||||
sex="male",
|
sex="male",
|
||||||
birthday=localdate(now()),
|
birthday=localdate(now()),
|
||||||
|
role=user["role"]
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
USER = CustomUser.objects.create_user(
|
||||||
|
username=user["email"],
|
||||||
|
email=user["email"],
|
||||||
|
password=get_secret("ADMIN_PASSWORD"),
|
||||||
|
sex="male",
|
||||||
|
birthday=localdate(now()),
|
||||||
|
role=user["role"]
|
||||||
|
|
||||||
print("Created administrator account:", ADMIN_USER.email)
|
)
|
||||||
|
print(f"Created {user['role']} account: {USER.email}")
|
||||||
|
|
||||||
ADMIN_USER.first_name = "Administrator"
|
USER.first_name = f"DEBUG_USER:{USER.email}"
|
||||||
ADMIN_USER.is_active = True
|
USER.is_active = True
|
||||||
ADMIN_USER.save()
|
USER.save()
|
||||||
|
|
|
@ -1,4 +1,7 @@
|
||||||
from django.core.management.base import BaseCommand, CommandError
|
from ollama import ChatResponse
|
||||||
|
import base64
|
||||||
|
import httpx
|
||||||
|
from django.core.management.base import BaseCommand
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from documents.models import Document
|
from documents.models import Document
|
||||||
|
@ -10,9 +13,11 @@ from config.settings import MEDIA_ROOT
|
||||||
from watchdog.observers import Observer
|
from watchdog.observers import Observer
|
||||||
from watchdog.events import FileSystemEventHandler
|
from watchdog.events import FileSystemEventHandler
|
||||||
from documents.models import Document
|
from documents.models import Document
|
||||||
|
from config.settings import get_secret
|
||||||
from django.core.files import File
|
from django.core.files import File
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
|
from ollama import Client
|
||||||
|
|
||||||
|
|
||||||
class PDFHandler(FileSystemEventHandler):
|
class PDFHandler(FileSystemEventHandler):
|
||||||
|
@ -36,8 +41,26 @@ class PDFHandler(FileSystemEventHandler):
|
||||||
|
|
||||||
def process_pdf(self, file_path):
|
def process_pdf(self, file_path):
|
||||||
try:
|
try:
|
||||||
filename = os.path.basename(file_path)
|
# Get the original filename and directory
|
||||||
filename = str(filename).replace(" ", "")
|
original_filename = os.path.basename(file_path)
|
||||||
|
original_dir = os.path.dirname(file_path)
|
||||||
|
|
||||||
|
# Check if the filename contains spaces
|
||||||
|
if " " in original_filename:
|
||||||
|
# Create the new filename by replacing spaces
|
||||||
|
new_filename = original_filename.replace(" ", "_")
|
||||||
|
|
||||||
|
# Construct the new full file path
|
||||||
|
new_file_path = os.path.join(original_dir, new_filename)
|
||||||
|
|
||||||
|
# Rename the file
|
||||||
|
os.rename(file_path, new_file_path)
|
||||||
|
|
||||||
|
# Update the filename and file_path variables
|
||||||
|
filename = new_filename
|
||||||
|
file_path = new_file_path
|
||||||
|
else:
|
||||||
|
filename = original_filename
|
||||||
metadata = ""
|
metadata = ""
|
||||||
document_type = ""
|
document_type = ""
|
||||||
|
|
||||||
|
@ -60,12 +83,62 @@ class PDFHandler(FileSystemEventHandler):
|
||||||
# Perform OCR
|
# Perform OCR
|
||||||
text = pytesseract.image_to_string(img).strip()
|
text = pytesseract.image_to_string(img).strip()
|
||||||
|
|
||||||
|
# Get document category
|
||||||
|
# Try to pass image to the Ollama image recognition API first
|
||||||
|
try:
|
||||||
|
client = Client(
|
||||||
|
host=get_secret("OLLAMA_URL"),
|
||||||
|
auth=httpx.BasicAuth(
|
||||||
|
username=get_secret("OLLAMA_USERNAME"), password=get_secret("OLLAMA_PASSWORD")) if get_secret("OLLAMA_USE_AUTH") else None
|
||||||
|
)
|
||||||
|
|
||||||
|
encoded_image = base64.b64encode(
|
||||||
|
img_buffer.getvalue()).decode()
|
||||||
|
|
||||||
|
attempts = 0
|
||||||
|
while True:
|
||||||
|
if attempts >= 3:
|
||||||
|
raise Exception(
|
||||||
|
"Unable to categorize using Ollama API")
|
||||||
|
attempts += 1
|
||||||
|
|
||||||
|
content = f"""
|
||||||
|
Read the text from the image and provide a category.
|
||||||
|
|
||||||
|
Possible categories are: Announcement, Manual, Form
|
||||||
|
|
||||||
|
Respond only with the category. No explanations are necessary.
|
||||||
|
"""
|
||||||
|
|
||||||
|
response: ChatResponse = client.chat(
|
||||||
|
model=get_secret("OLLAMA_MODEL"),
|
||||||
|
messages=[
|
||||||
|
{"role": "user", "content": content,
|
||||||
|
"images": [encoded_image]},
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
document_type = response["message"]["content"].replace(
|
||||||
|
"*", "").replace(".", "")
|
||||||
|
|
||||||
|
# A few safety checks if the model does not follow through with output instructions
|
||||||
|
if len(document_type) > 16:
|
||||||
|
self.logger.warning(
|
||||||
|
f"Ollama API gave incorrect document category: {response['message']['content']}. Retrying...")
|
||||||
|
break
|
||||||
|
|
||||||
|
# If that fails, just use regular OCR to read the title as a dirty fix/fallback
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Error! {e}")
|
||||||
|
self.logger.warning(
|
||||||
|
"Ollama OCR offloading failed. Falling back to default OCR")
|
||||||
lines = text.split("\n")
|
lines = text.split("\n")
|
||||||
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
if line.strip():
|
if line.strip():
|
||||||
document_type = line.strip().lower()
|
document_type = line.strip().lower()
|
||||||
break
|
break
|
||||||
|
|
||||||
if not document_type:
|
if not document_type:
|
||||||
document_type = "other"
|
document_type = "other"
|
||||||
|
|
||||||
|
@ -73,7 +146,7 @@ class PDFHandler(FileSystemEventHandler):
|
||||||
|
|
||||||
# Open the file for instance creation
|
# Open the file for instance creation
|
||||||
DOCUMENT, created = Document.objects.get_or_create(
|
DOCUMENT, created = Document.objects.get_or_create(
|
||||||
name=filename,
|
name=filename.replace(".pdf", ""),
|
||||||
defaults={
|
defaults={
|
||||||
"number_pages": num_pages,
|
"number_pages": num_pages,
|
||||||
"ocr_metadata": metadata,
|
"ocr_metadata": metadata,
|
||||||
|
@ -85,8 +158,7 @@ class PDFHandler(FileSystemEventHandler):
|
||||||
DOCUMENT.file.save(
|
DOCUMENT.file.save(
|
||||||
name=filename, content=File(open(file_path, "rb")))
|
name=filename, content=File(open(file_path, "rb")))
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
f"Document '{filename}' created successfully with type '{
|
f"Document '{filename}' created successfully with type '{document_type}'."
|
||||||
document_type}'."
|
|
||||||
)
|
)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -8,4 +8,4 @@ from .models import Document
|
||||||
class DocumentAdmin(ModelAdmin):
|
class DocumentAdmin(ModelAdmin):
|
||||||
model = Document
|
model = Document
|
||||||
search_fields = ["id", "name", "document_type"]
|
search_fields = ["id", "name", "document_type"]
|
||||||
list_display = ["id", "name", "document_type"]
|
list_display = ["id", "name", "document_type", "date_uploaded"]
|
||||||
|
|
|
@ -51,7 +51,7 @@ class DocumentListView(generics.ListAPIView):
|
||||||
|
|
||||||
http_method_names = ["get"]
|
http_method_names = ["get"]
|
||||||
serializer_class = DocumentSerializer
|
serializer_class = DocumentSerializer
|
||||||
queryset = Document.objects.all()
|
queryset = Document.objects.all().order_by("-date_uploaded")
|
||||||
pagination_class = PageNumberPagination
|
pagination_class = PageNumberPagination
|
||||||
permission_classes = [IsAuthenticated]
|
permission_classes = [IsAuthenticated]
|
||||||
|
|
||||||
|
|
|
@ -10,7 +10,7 @@
|
||||||
{% trans 'Please visit the site to check your request:' %}
|
{% trans 'Please visit the site to check your request:' %}
|
||||||
{{ url|safe }}
|
{{ url|safe }}
|
||||||
|
|
||||||
{% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies:' %}
|
{% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies. Hardcopy requests are valid only within 1 month of requesting.' %}
|
||||||
{% endblock %}
|
{% endblock %}
|
||||||
|
|
||||||
{% block html_body %}
|
{% block html_body %}
|
||||||
|
@ -27,6 +27,6 @@
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
{% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies:' %}
|
{% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies. Hardcopy requests are valid only within 1 month of requesting.' %}
|
||||||
</p>
|
</p>
|
||||||
{% endblock %}
|
{% endblock %}
|
||||||
|
|
Loading…
Add table
Reference in a new issue