#!/usr/bin/env python

# similarities-by-keyword.py - given a number of configurations, output a TSV file containing words similar to the keywords

# Eric Lease Morgan <emorgan@nd.edu>
# (c) University of Notre Dame; distributed under a GNU Public License

# August 8, 2022 - first documentation


# configure; edit these values to change what the script reports

# name of the study carrel; passed to both rdr.keywords and rdr.word2vec
CARREL = 'pamphlets'

# passed to rdr.word2vec as its "type" argument
TYPE = 'similarity'

# passed to rdr.word2vec as its "topn" argument
SIZE = 32

# controls how many of the carrel's keywords are read into the lexicon
LIMIT = 32

# column names; joined with tabs to form the first row of the output
HEADER = ['source', 'target', 'weight']

# keywords to ignore when querying for similar words
STOPWORDS = ['thy']

# require
import rdr

# initialize; the first row of the output is the tab-delimited header
network = [ '\t'.join( HEADER ) ]

# create a lexicon from the given carrel's list of keywords; the result of
# rdr.keywords is read as newline-delimited records whose first
# tab-delimited field is the keyword itself
lexicon  = []
keywords = rdr.keywords( CARREL, count=True ).split( '\n' )
for keyword in keywords :

	# stop as soon as the lexicon holds LIMIT keywords; checking before the
	# append (instead of comparing an index afterwards) keeps the lexicon
	# from growing past the configured limit
	if len( lexicon ) >= LIMIT : break

	# parse
	keyword = keyword.split( '\t' )[ 0 ]

	# skip blank records, such as the one left behind by a trailing newline;
	# an empty keyword is not a meaningful query
	if not keyword.strip() : continue

	# update
	lexicon.append( keyword )

# process each word in the lexicon
for word in lexicon :

	# skip stopwords and phrases; only single words are used as queries
	if word in STOPWORDS : continue
	if ' '  in word      : continue

	# find and process each similar word to the given word; the result of
	# rdr.word2vec is read as newline-delimited, tab-delimited records
	records = rdr.word2vec( CARREL, type=TYPE, topn=SIZE, query=word ).split( '\n' )
	for record in records :

		# parse; split each record only once
		fields = record.split( '\t' )

		# skip blank or otherwise incomplete records (for example, the empty
		# string left behind by a trailing newline) instead of failing with
		# an IndexError when the second field is missing
		if len( fields ) < 2 : continue

		source = word
		target = fields[ 0 ]
		weight = fields[ 1 ]

		# update the network
		network.append( '\t'.join( [ source, target, weight ] ) )

# output and done
print( '\n'.join( network ) )
exit()