""" User Document Model - File uploads for RAG (Retrieval Augmented Generation). """ from django.db import models from django.conf import settings from django.utils.translation import gettext_lazy as _ from core.models import TimestampedModel import os class UserDocument(TimestampedModel): """ Track user-uploaded documents for RAG. The actual embeddings are stored in pgvector (PGVECTOR_CONNECTION_STRING). This model stores file metadata and references to vector store. """ # User and session user = models.ForeignKey( settings.AUTH_USER_MODEL, on_delete=models.CASCADE, related_name="documents", help_text=_("User who uploaded this document"), ) chat_session = models.ForeignKey( "chatbot.ChatSession", on_delete=models.SET_NULL, null=True, blank=True, related_name="documents", help_text=_("Chat session this document is associated with"), ) # File information file = models.FileField( upload_to="user_documents/%Y/%m/%d/", help_text=_("Uploaded document file") ) file_name = models.CharField(max_length=255, help_text=_("Original filename")) file_size = models.BigIntegerField(help_text=_("File size in bytes")) file_type = models.CharField(max_length=100, help_text=_("MIME type of the file")) file_extension = models.CharField( max_length=10, help_text=_("File extension (e.g., .pdf, .docx)") ) # Processing status processing_status = models.CharField( max_length=20, default="pending", choices=[ ("pending", "Pending Processing"), ("processing", "Processing"), ("completed", "Completed"), ("failed", "Failed"), ], help_text=_("Document processing status"), ) processed_at = models.DateTimeField( null=True, blank=True, help_text=_("When processing completed") ) # Vector store references - REQUIRED for pgvector vector_collection_name = models.CharField( max_length=255, blank=True, null=True, db_index=True, help_text=_( "PGVector collection name where embeddings are stored (REQUIRED for vector operations)" ), ) vector_collection_metadata = models.JSONField( default=dict, blank=True, help_text=_("Optional metadata for the PGVector collection itself"), ) vector_store_ids = models.JSONField( default=list, blank=True, help_text=_("List of pgvector document IDs for this file's chunks"), ) chunk_count = models.IntegerField( default=0, help_text=_("Number of chunks/embeddings created") ) # Document metadata title = models.CharField( max_length=255, blank=True, null=True, help_text=_("User-defined or extracted document title"), ) description = models.TextField( blank=True, null=True, help_text=_("User description or summary of document") ) tags = models.JSONField( default=list, blank=True, help_text=_("User-defined tags for organization") ) # Extracted content metadata (file-level) extracted_metadata = models.JSONField( default=dict, blank=True, help_text=_("Metadata extracted from document (author, date, pages, etc.)"), ) # Vector metadata - stored with each chunk in pgvector for filtering vector_metadata = models.JSONField( default=dict, blank=True, help_text=_( "Searchable metadata for pgvector filtering (e.g., {'user_id': '123', 'category': 'research', 'date': '2025-01'})" ), ) page_count = models.IntegerField( null=True, blank=True, help_text=_("Number of pages (for PDFs, documents)") ) word_count = models.IntegerField( null=True, blank=True, help_text=_("Approximate word count") ) # Visibility and access is_active = models.BooleanField( default=True, help_text=_("Whether this document is active and searchable") ) is_shared = models.BooleanField( default=False, help_text=_("Whether document is shared with other 
users") ) share_settings = models.JSONField( default=dict, blank=True, help_text=_("Document sharing configuration") ) # Error tracking processing_error = models.TextField( blank=True, null=True, help_text=_("Error message if processing failed") ) retry_count = models.IntegerField( default=0, help_text=_("Number of processing retries") ) class Meta: verbose_name = _("User Document") verbose_name_plural = _("User Documents") ordering = ["-created_at"] indexes = [ models.Index(fields=["user", "-created_at"], name="userdoc_user_date_idx"), models.Index(fields=["user", "is_active"], name="userdoc_user_active_idx"), models.Index(fields=["processing_status"], name="userdoc_status_idx"), models.Index(fields=["file_type"], name="userdoc_type_idx"), models.Index( fields=["vector_collection_name"], name="userdoc_collection_idx" ), models.Index( fields=["user", "vector_collection_name"], name="userdoc_user_collection_idx", ), ] def __str__(self): return f"{self.file_name} ({self.user.email})" def save(self, *args, **kwargs): """Extract file metadata on save.""" if self.file: # Extract filename and extension if not self.file_name: self.file_name = os.path.basename(self.file.name) if not self.file_extension: self.file_extension = os.path.splitext(self.file_name)[1].lower() # Get file size if hasattr(self.file, "size"): self.file_size = self.file.size super().save(*args, **kwargs) def mark_processing_started(self): """Mark document as processing.""" self.processing_status = "processing" self.save(update_fields=["processing_status"]) def mark_processing_completed( self, collection_name, vector_ids, chunk_count, collection_metadata=None, vector_metadata=None, ): """ Mark document processing as completed. Args: collection_name: PGVector collection name (REQUIRED) vector_ids: List of document IDs in pgvector chunk_count: Number of chunks created collection_metadata: Optional metadata for the collection vector_metadata: Metadata to be stored with each chunk for filtering """ from django.utils import timezone self.processing_status = "completed" self.processed_at = timezone.now() self.vector_collection_name = collection_name self.vector_store_ids = vector_ids self.chunk_count = chunk_count if collection_metadata: self.vector_collection_metadata = collection_metadata if vector_metadata: self.vector_metadata = vector_metadata self.save( update_fields=[ "processing_status", "processed_at", "vector_collection_name", "vector_store_ids", "chunk_count", "vector_collection_metadata", "vector_metadata", ] ) def mark_processing_failed(self, error_message): """Mark document processing as failed.""" self.processing_status = "failed" self.processing_error = error_message self.retry_count += 1 self.save( update_fields=["processing_status", "processing_error", "retry_count"] ) def get_vector_metadata(self): """ Get metadata dict to be stored with vector embeddings. 
Returns: dict: Metadata for pgvector filtering """ # Combine vector_metadata with essential fields metadata = { "user_id": str(self.user.id), "document_id": str(self.id), "file_name": self.file_name, "file_type": self.file_type, "upload_date": self.created_at.isoformat(), } # Add user tags if present if self.tags: metadata["tags"] = self.tags # Add session if present if self.chat_session: metadata["session_id"] = str(self.chat_session.id) # Merge with custom vector_metadata if self.vector_metadata: metadata.update(self.vector_metadata) return metadata @property def file_size_mb(self): """Get file size in MB.""" if self.file_size: return round(self.file_size / (1024 * 1024), 2) return 0 @property def is_processable(self): """Check if document can be processed for RAG.""" processable_types = [ "application/pdf", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", # .docx "application/msword", # .doc "text/plain", "text/markdown", "text/csv", ] return self.file_type in processable_types @property def has_embeddings(self): """Check if document has been processed and has embeddings.""" return bool( self.processing_status == "completed" and self.vector_collection_name and self.vector_store_ids ) @classmethod def get_user_storage_usage(cls, user): """Get total storage used by user's documents.""" usage = cls.objects.filter(user=user, is_active=True).aggregate( total_size=models.Sum("file_size"), total_documents=models.Count("id"), total_chunks=models.Sum("chunk_count"), ) return { "total_size_bytes": usage["total_size"] or 0, "total_size_mb": round((usage["total_size"] or 0) / (1024 * 1024), 2), "total_documents": usage["total_documents"] or 0, "total_chunks": usage["total_chunks"] or 0, } @classmethod def get_documents_in_collection(cls, collection_name, user=None): """ Get all documents in a specific PGVector collection. Args: collection_name: Name of the pgvector collection user: Optional user filter Returns: QuerySet: Documents in the collection """ queryset = cls.objects.filter( vector_collection_name=collection_name, processing_status="completed" ) if user: queryset = queryset.filter(user=user) return queryset
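

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not executed at import time): how an
# ingestion task might drive the processing lifecycle defined above.
# `split_into_chunks` and `add_chunks_to_pgvector` are hypothetical helpers
# standing in for whatever chunking / embedding pipeline the project uses;
# the collection name format below is likewise an assumption.
#
#     doc.mark_processing_started()
#     try:
#         chunks = split_into_chunks(doc.file)
#         vector_ids = add_chunks_to_pgvector(
#             collection_name=f"user_{doc.user_id}_docs",
#             chunks=chunks,
#             metadata=doc.get_vector_metadata(),
#         )
#         doc.mark_processing_completed(
#             collection_name=f"user_{doc.user_id}_docs",
#             vector_ids=vector_ids,
#             chunk_count=len(chunks),
#         )
#     except Exception as exc:
#         doc.mark_processing_failed(str(exc))
# ---------------------------------------------------------------------------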