#!/usr/bin/env python

# ask-me-anything.py - given a question, output list of matching-esque questions as well as their answers
# see: https://stackoverflow.com/questions/64792776/cosine-similarity-between-string-and-list-of-strings

# Eric Lease Morgan <emorgan@nd.edu>
# (c) University of Notre Dame; distributed under a GNU Public License

# October 23, 2023 - first investigations
# October 25, 2023 - started applying the process to a carrel as well as returning answers


# configure
# BULLET is the prefix of each printed result
BULLET          = '  *'
# PROMPT is the text passed to input() when asking for a question
PROMPT          = '\nAsk me anything: '
# SALUTATION is the message passed to exit() when no more input can be read
SALUTATION      = '\nOkay, bye bye, and thank you.'
# PATTERN is the glob pattern used to find the files to read in the given carrel
PATTERN         = '*.csv'
# a result is output only if its cosine similarity exceeds a threshold, and the
# threshold is chosen by the number of words in the question: SMALL for one or
# two words, MEDIUM for three or four words, and LARGE for five or more words
THRESHOLDSMALL  = 0.3
THRESHOLDMEDIUM = 0.5
THRESHOLDLARGE  = 0.7

# require
from pandas                          import DataFrame, read_csv, concat
from pathlib                         import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise        import cosine_similarity
from sys                             import argv, exit

# get input; exactly one command-line argument, the carrel to query, is expected,
# and anything else results in a usage message and an early exit
if len( argv ) == 2 : carrel = argv[ 1 ]
else                : exit( f"Usage: {argv[0]} <carrel>" )

# process each model; read every file in the carrel matching the pattern
models = [ read_csv( model ) for model in Path( carrel ).glob( PATTERN ) ]

# concat() raises a ValueError when given nothing to combine, so exit gracefully instead
if not models : exit( "No files matching " + PATTERN + " found in " + carrel )

# combine the models into a single table; rows lacking a question can not be
# vectorized, and rows lacking an answer have nothing to report, so drop both
qanda = concat( models ).dropna( subset=[ 'question', 'answer' ] )

# get lists of questions, answers, and scores; questions and answers are coerced
# to strings because read_csv() may have inferred a numeric type for them
questions = list( qanda[ 'question' ].astype( str ) )
answers   = list( qanda[ 'answer' ].astype( str ) )
scores    = list( qanda[ 'score' ] )

# without at least one question there is nothing to compare against
if not questions : exit( "No questions found in " + carrel )

# repeat forever, almost; the loop ends only when input can no longer be read,
# and consequently no statement after the loop would ever be reached
while True :

	# try to get input; end-of-file (control-d) and an interrupt (control-c) mean quit,
	# and any other error is allowed to surface instead of being silently swallowed
	# NOTE(review): exit() given a string writes it to standard error and sets the exit status to 1 -- confirm this is intended for a normal goodbye
	try                                    : question = input( PROMPT )
	except ( EOFError, KeyboardInterrupt ) : exit( SALUTATION )

	# a question without any words can not match anything; ask again
	size = len( question.split() )
	if size == 0 : continue

	# scale the threshold; the more words in the question, the greater the similarity required
	if   size <= 2 : threshold = THRESHOLDSMALL
	elif size <= 4 : threshold = THRESHOLDMEDIUM
	else           : threshold = THRESHOLDLARGE

	# do the work; vectorize the given question along with the known questions,
	# and compare the former (row 0) to each of the latter (rows 1 and beyond)
	model        = TfidfVectorizer().fit_transform( [ question ] + questions )
	similarities = cosine_similarity( model[ 0, : ], model[ 1:, : ] )[ 0 ]
	results      = DataFrame( { 'similarity':similarities, 'question':questions, 'answer':answers, 'score':scores } ).sort_values( 'similarity', ascending=False )

	# process (filter) the results; keep only the significant ones, most similar first
	hits = results[ results[ 'similarity' ] > threshold ]

	# output; formatting (as opposed to concatenating) works even if an answer is not a string
	for hit in hits.itertuples( index=False ) :
		score = str( round( hit.score, 2 ) )
		print( f"{BULLET} {hit.question} ({hit.answer} / {score})" )
