import os
import logging
import shutil

from services.document_processor import DocumentProcessor
from services.vector_store import VectorStore
from config.settings import Settings

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def main():
    logger.info("Starting document ingestion process")

    # Rebuild the vector store from scratch: drop any existing Chroma database.
    if os.path.exists(Settings.CHROMA_PERSIST_DIR):
        logger.warning(f"Removing existing database at {Settings.CHROMA_PERSIST_DIR}")
        shutil.rmtree(Settings.CHROMA_PERSIST_DIR)

    processor = DocumentProcessor()
    vector_store = VectorStore()

    # Ensure the data directory exists; if it was just created, there is nothing to ingest yet.
    if not os.path.exists(Settings.DATA_DIR):
        os.makedirs(Settings.DATA_DIR)
        logger.info(f"Data directory '{Settings.DATA_DIR}' not found. Created it. Please add PDFs there.")
        return

    pdf_files = [f for f in os.listdir(Settings.DATA_DIR) if f.endswith('.pdf')]
    if not pdf_files:
        logger.warning("No PDF files found in the data directory. Please add PDFs there.")
        return

    total_chunks = 0
    for pdf_file in pdf_files:
        file_path = os.path.join(Settings.DATA_DIR, pdf_file)
        logger.info(f"Processing: {pdf_file}...")
        try:
            # Split the PDF into chunks, tag each chunk with its source file, and store the batch.
            chunks = processor.process_pdf(file_path)
            for chunk in chunks:
                chunk.metadata['source'] = pdf_file
            vector_store.add_documents(chunks)
            total_chunks += len(chunks)
            logger.info(f"Processed {len(chunks)} chunks from {pdf_file}")
        except Exception as e:
            logger.error(f"Failed to process {pdf_file}: {str(e)}")
            continue

    logger.info(f"Ingestion completed. Total chunks stored: {total_chunks}")


if __name__ == "__main__":
    main()
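
The script depends on local services/ and config/ modules that are not shown on this page. The sketch below is only an assumption of the interfaces main() relies on, inferred from how they are called; the class and attribute names beyond those imported above (Chunk, the default DATA_DIR and CHROMA_PERSIST_DIR values) are hypothetical placeholders, not the Space's actual implementation.

# Hypothetical sketch of the assumed interfaces; real implementations live in
# services/ and config/ and may differ.
from dataclasses import dataclass, field


@dataclass
class Chunk:
    # process_pdf() must return objects with a mutable metadata dict,
    # since main() sets chunk.metadata['source'].
    text: str
    metadata: dict = field(default_factory=dict)


class DocumentProcessor:
    def process_pdf(self, file_path: str) -> list[Chunk]:
        # Assumed to load one PDF and split it into text chunks.
        ...


class VectorStore:
    def add_documents(self, chunks: list[Chunk]) -> None:
        # Assumed to embed the chunks and persist them, presumably to Chroma
        # given CHROMA_PERSIST_DIR in Settings.
        ...


class Settings:
    # Assumed configuration constants referenced by the script;
    # the actual values are defined in config/settings.py.
    DATA_DIR: str = "data"
    CHROMA_PERSIST_DIR: str = "chroma_db"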