mirror of
https://github.com/lemeow125/DocManagerBackend.git
synced 2025-04-12 04:51:30 +08:00
Compare commits
3 commits
9289166c0e
...
844113d44f
Author | SHA1 | Date | |
---|---|---|---|
844113d44f | |||
d81319c8ec | |||
f39f5966d6 |
8 changed files with 143 additions and 34 deletions
|
@ -20,4 +20,11 @@ ADMIN_EMAIL = 'admin@test.com'
|
|||
ADMIN_PASSWORD = ''
|
||||
|
||||
# To insert test data or not (UNUSED)
|
||||
TEST_DATA = "True"
|
||||
TEST_DATA = "True"
|
||||
|
||||
# Ollama for Categorization
|
||||
OLLAMA_URL = "localhost:11434"
|
||||
OLLAMA_USE_AUTH = "False"
|
||||
OLLAMA_MODEL = "knoopx/mobile-vlm:3b-fp16"
|
||||
OLLAMA_USERNAME = ""
|
||||
OLLAMA_PASSWORD = ""
|
|
@ -8,7 +8,8 @@ class IsStaff(BasePermission):
|
|||
|
||||
def has_permission(self, request, view):
|
||||
return bool(
|
||||
request.user and request.user.role in ("head", "admin", "planning", "staff")
|
||||
request.user and request.user.role in (
|
||||
"head", "admin", "planning", "staff")
|
||||
)
|
||||
|
||||
|
||||
|
@ -18,7 +19,7 @@ class IsPlanning(BasePermission):
|
|||
"""
|
||||
|
||||
def has_permission(self, request, view):
|
||||
return bool(request.user and request.user.role in ("head", "admin", "planning"))
|
||||
return bool(request.user and request.user.role == "planning")
|
||||
|
||||
|
||||
class IsHead(BasePermission):
|
||||
|
|
|
@ -6,7 +6,7 @@ from rest_framework.settings import api_settings
|
|||
|
||||
|
||||
class CustomUserSerializer(serializers.ModelSerializer):
|
||||
birthday = serializers.DateField(format="%m-%d-%Y")
|
||||
birthday = serializers.DateField(format="%Y-%m-%d")
|
||||
|
||||
class Meta:
|
||||
model = CustomUser
|
||||
|
|
|
@ -10,19 +10,48 @@ from .models import CustomUser
|
|||
def create_admin_user(sender, **kwargs):
|
||||
# Programmatically creates the administrator account
|
||||
if sender.name == "accounts":
|
||||
ADMIN_USER = CustomUser.objects.filter(
|
||||
email=get_secret("ADMIN_EMAIL")).first()
|
||||
if not ADMIN_USER:
|
||||
ADMIN_USER = CustomUser.objects.create_superuser(
|
||||
username=get_secret("ADMIN_EMAIL"),
|
||||
email=get_secret("ADMIN_EMAIL"),
|
||||
password=get_secret("ADMIN_PASSWORD"),
|
||||
sex="male",
|
||||
birthday=localdate(now()),
|
||||
)
|
||||
users = [{
|
||||
"email": get_secret("ADMIN_EMAIL"),
|
||||
"role": "head",
|
||||
"admin": True,
|
||||
}, {
|
||||
"email": "staff@test.com",
|
||||
"role": "staff",
|
||||
"admin": False,
|
||||
}, {
|
||||
"email": "planning@test.com",
|
||||
"role": "planning",
|
||||
"admin": False,
|
||||
}, {
|
||||
"email": "client@test.com",
|
||||
"role": "client",
|
||||
"admin": False,
|
||||
},]
|
||||
for user in users:
|
||||
USER = CustomUser.objects.filter(
|
||||
email=user["email"]).first()
|
||||
if not USER:
|
||||
if user["admin"]:
|
||||
USER = CustomUser.objects.create_superuser(
|
||||
username=user["email"],
|
||||
email=user["email"],
|
||||
password=get_secret("ADMIN_PASSWORD"),
|
||||
sex="male",
|
||||
birthday=localdate(now()),
|
||||
role=user["role"]
|
||||
)
|
||||
else:
|
||||
USER = CustomUser.objects.create_user(
|
||||
username=user["email"],
|
||||
email=user["email"],
|
||||
password=get_secret("ADMIN_PASSWORD"),
|
||||
sex="male",
|
||||
birthday=localdate(now()),
|
||||
role=user["role"]
|
||||
|
||||
print("Created administrator account:", ADMIN_USER.email)
|
||||
)
|
||||
print(f"Created {user['role']} account: {USER.email}")
|
||||
|
||||
ADMIN_USER.first_name = "Administrator"
|
||||
ADMIN_USER.is_active = True
|
||||
ADMIN_USER.save()
|
||||
USER.first_name = f"DEBUG_USER:{USER.email}"
|
||||
USER.is_active = True
|
||||
USER.save()
|
||||
|
|
|
@ -1,4 +1,7 @@
|
|||
from django.core.management.base import BaseCommand, CommandError
|
||||
from ollama import ChatResponse
|
||||
import base64
|
||||
import httpx
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
from io import BytesIO
|
||||
from documents.models import Document
|
||||
|
@ -10,9 +13,11 @@ from config.settings import MEDIA_ROOT
|
|||
from watchdog.observers import Observer
|
||||
from watchdog.events import FileSystemEventHandler
|
||||
from documents.models import Document
|
||||
from config.settings import get_secret
|
||||
from django.core.files import File
|
||||
import logging
|
||||
import time
|
||||
from ollama import Client
|
||||
|
||||
|
||||
class PDFHandler(FileSystemEventHandler):
|
||||
|
@ -36,8 +41,26 @@ class PDFHandler(FileSystemEventHandler):
|
|||
|
||||
def process_pdf(self, file_path):
|
||||
try:
|
||||
filename = os.path.basename(file_path)
|
||||
filename = str(filename).replace(" ", "")
|
||||
# Get the original filename and directory
|
||||
original_filename = os.path.basename(file_path)
|
||||
original_dir = os.path.dirname(file_path)
|
||||
|
||||
# Check if the filename contains spaces
|
||||
if " " in original_filename:
|
||||
# Create the new filename by replacing spaces
|
||||
new_filename = original_filename.replace(" ", "_")
|
||||
|
||||
# Construct the new full file path
|
||||
new_file_path = os.path.join(original_dir, new_filename)
|
||||
|
||||
# Rename the file
|
||||
os.rename(file_path, new_file_path)
|
||||
|
||||
# Update the filename and file_path variables
|
||||
filename = new_filename
|
||||
file_path = new_file_path
|
||||
else:
|
||||
filename = original_filename
|
||||
metadata = ""
|
||||
document_type = ""
|
||||
|
||||
|
@ -60,20 +83,70 @@ class PDFHandler(FileSystemEventHandler):
|
|||
# Perform OCR
|
||||
text = pytesseract.image_to_string(img).strip()
|
||||
|
||||
lines = text.split("\n")
|
||||
# Get document category
|
||||
# Try to pass image to the Ollama image recognition API first
|
||||
try:
|
||||
client = Client(
|
||||
host=get_secret("OLLAMA_URL"),
|
||||
auth=httpx.BasicAuth(
|
||||
username=get_secret("OLLAMA_USERNAME"), password=get_secret("OLLAMA_PASSWORD")) if get_secret("OLLAMA_USE_AUTH") else None
|
||||
)
|
||||
|
||||
for line in lines:
|
||||
if line.strip():
|
||||
document_type = line.strip().lower()
|
||||
encoded_image = base64.b64encode(
|
||||
img_buffer.getvalue()).decode()
|
||||
|
||||
attempts = 0
|
||||
while True:
|
||||
if attempts >= 3:
|
||||
raise Exception(
|
||||
"Unable to categorize using Ollama API")
|
||||
attempts += 1
|
||||
|
||||
content = f"""
|
||||
Read the text from the image and provide a category.
|
||||
|
||||
Possible categories are: Announcement, Manual, Form
|
||||
|
||||
Respond only with the category. No explanations are necessary.
|
||||
"""
|
||||
|
||||
response: ChatResponse = client.chat(
|
||||
model=get_secret("OLLAMA_MODEL"),
|
||||
messages=[
|
||||
{"role": "user", "content": content,
|
||||
"images": [encoded_image]},
|
||||
],
|
||||
)
|
||||
|
||||
document_type = response["message"]["content"].replace(
|
||||
"*", "").replace(".", "")
|
||||
|
||||
# A few safety checks if the model does not follow through with output instructions
|
||||
if len(document_type) > 16:
|
||||
self.logger.warning(
|
||||
f"Ollama API gave incorrect document category: {response['message']['content']}. Retrying...")
|
||||
break
|
||||
if not document_type:
|
||||
document_type = "other"
|
||||
|
||||
# If that fails, just use regular OCR to read the title as a dirty fix/fallback
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Error! {e}")
|
||||
self.logger.warning(
|
||||
"Ollama OCR offloading failed. Falling back to default OCR")
|
||||
lines = text.split("\n")
|
||||
|
||||
for line in lines:
|
||||
if line.strip():
|
||||
document_type = line.strip().lower()
|
||||
break
|
||||
|
||||
if not document_type:
|
||||
document_type = "other"
|
||||
|
||||
metadata += text
|
||||
|
||||
# Open the file for instance creation
|
||||
DOCUMENT, created = Document.objects.get_or_create(
|
||||
name=filename,
|
||||
name=filename.replace(".pdf", ""),
|
||||
defaults={
|
||||
"number_pages": num_pages,
|
||||
"ocr_metadata": metadata,
|
||||
|
@ -85,8 +158,7 @@ class PDFHandler(FileSystemEventHandler):
|
|||
DOCUMENT.file.save(
|
||||
name=filename, content=File(open(file_path, "rb")))
|
||||
self.logger.info(
|
||||
f"Document '{filename}' created successfully with type '{
|
||||
document_type}'."
|
||||
f"Document '{filename}' created successfully with type '{document_type}'."
|
||||
)
|
||||
|
||||
else:
|
||||
|
|
|
@ -8,4 +8,4 @@ from .models import Document
|
|||
class DocumentAdmin(ModelAdmin):
|
||||
model = Document
|
||||
search_fields = ["id", "name", "document_type"]
|
||||
list_display = ["id", "name", "document_type"]
|
||||
list_display = ["id", "name", "document_type", "date_uploaded"]
|
||||
|
|
|
@ -51,7 +51,7 @@ class DocumentListView(generics.ListAPIView):
|
|||
|
||||
http_method_names = ["get"]
|
||||
serializer_class = DocumentSerializer
|
||||
queryset = Document.objects.all()
|
||||
queryset = Document.objects.all().order_by("-date_uploaded")
|
||||
pagination_class = PageNumberPagination
|
||||
permission_classes = [IsAuthenticated]
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
{% trans 'Please visit the site to check your request:' %}
|
||||
{{ url|safe }}
|
||||
|
||||
{% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies:' %}
|
||||
{% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies. Hardcopy requests are valid only within 1 month of requesting.' %}
|
||||
{% endblock %}
|
||||
|
||||
{% block html_body %}
|
||||
|
@ -27,6 +27,6 @@
|
|||
</p>
|
||||
|
||||
<p>
|
||||
{% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies:' %}
|
||||
{% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies. Hardcopy requests are valid only within 1 month of requesting.' %}
|
||||
</p>
|
||||
{% endblock %}
|
||||
|
|
Loading…
Add table
Reference in a new issue