#!/usr/bin/env python

# index.py - given a few configurations, index content against a large language model

# Eric Lease Morgan <emorgan@nd.edu>
# (c) University of Notre Dame; distributed under a GNU Public License

# January 2, 2024 - first cut, but really hacked upon for the past couple of weeks


# configure; both values are relative directory names
CORPUS = "corpus"    # directory handed to the reader as the content to index
STORAGE = "index"    # directory created if missing and used to persist the index

# require
from llama_index             import VectorStoreIndex, SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
from pathlib                 import Path

# initialize the node parser and the path object for the output directory
parser  = SimpleNodeParser()
storage = Path( STORAGE )

# make sane; create the output directory (and any missing parents, in case
# STORAGE is configured as a nested path) and tolerate it already existing
storage.mkdir( parents=True, exist_ok=True )

# the magic happens here: load the corpus as Llama Index documents, split the
# documents into nodes, build a vector store index from the nodes, and save
# the result to the output directory
documents = SimpleDirectoryReader( CORPUS ).load_data()
nodes     = parser.get_nodes_from_documents( documents )
index     = VectorStoreIndex( nodes )
index.storage_context.persist( persist_dir=storage )

# done; raise SystemExit directly rather than calling exit(), because exit()
# is installed by the site module for interactive use and is not guaranteed
# to exist (for example, under "python -S")
raise SystemExit