File size: 1,804 Bytes
0ec978f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import os 
import logging
import shutil
from services.document_processor import DocumentProcessor
from services.vector_store import VectorStore
from config.settings import Settings

# Configure the root logger once at import time: INFO level and a
# "timestamp / level / message" line layout. Then grab a logger named after
# this module for all messages emitted below.
logging.basicConfig(
    format='%(asctime)s- %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

def main():
    """Rebuild the vector store from the PDF files in ``Settings.DATA_DIR``.

    Steps, in order:

    1. If the data directory does not exist, create it, log a hint and stop.
    2. Collect the PDF files in the data directory (extension matched
       case-insensitively, names sorted for a deterministic order). Stop with
       a warning if there are none.
    3. Remove any existing database at ``Settings.CHROMA_PERSIST_DIR``.
    4. For each PDF: split it into chunks with ``DocumentProcessor``, tag every
       chunk's ``metadata['source']`` with the file name, and add the chunks
       to the ``VectorStore``.

    The old database is removed only after at least one PDF has been found,
    so running the script against a missing or empty data directory no longer
    destroys the existing index.

    A failure while handling one file is logged with its traceback and does
    not stop the remaining files from being processed.
    """
    logger.info("Starting document ingestion process")

    data_dir = Settings.DATA_DIR
    if not os.path.isdir(data_dir):
        # exist_ok guards against the directory appearing between the check
        # and the creation.
        os.makedirs(data_dir, exist_ok=True)
        logger.info(
            "Data directory '%s' not found. Created it. Please add PDFs there.",
            data_dir,
        )
        return

    # Sorted so that ingestion order does not depend on the filesystem.
    # Only regular files are considered; a directory that happens to be
    # named "*.pdf" is skipped.
    pdf_files = sorted(
        name
        for name in os.listdir(data_dir)
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(data_dir, name))
    )

    if not pdf_files:
        logger.warning("No PDF files found in the data directory. Please add PDFs there.")
        return

    # Wipe the old database only now that there is something to ingest.
    persist_dir = Settings.CHROMA_PERSIST_DIR
    if os.path.exists(persist_dir):
        logger.warning("Removing existing database at %s", persist_dir)
        shutil.rmtree(persist_dir)

    # The store is created after the wipe so it starts from a clean directory.
    processor = DocumentProcessor()
    vector_store = VectorStore()

    total_chunks = 0
    failed_files = []

    for pdf_file in pdf_files:
        file_path = os.path.join(data_dir, pdf_file)
        logger.info("Processing: %s...", pdf_file)

        try:
            chunks = processor.process_pdf(file_path)

            if not chunks:
                # NOTE(review): presumably a PDF without extractable text;
                # skip the store call rather than sending it an empty batch.
                logger.warning("No chunks produced from %s; skipping", pdf_file)
                continue

            # Record which file each chunk came from.
            for chunk in chunks:
                chunk.metadata['source'] = pdf_file

            vector_store.add_documents(chunks)
            total_chunks += len(chunks)
            logger.info("Processed %d chunks from %s", len(chunks), pdf_file)

        except Exception as e:
            # Best-effort batch job: keep going, but keep the traceback so
            # the failure can be diagnosed afterwards.
            logger.exception("Failed to process %s: %s", pdf_file, e)
            failed_files.append(pdf_file)
            continue

    if failed_files:
        logger.warning(
            "%d file(s) could not be ingested: %s",
            len(failed_files),
            ", ".join(failed_files),
        )

    logger.info("Ingestion Completed. Total chunks stored: %d", total_chunks)

# Run the ingestion only when this file is executed as a script; importing
# the module does not trigger it.
if __name__ == "__main__":
    main()