#!/usr/bin/env python

# similaraaities-by-keyword.py - given a number of configurations, output a TSV file containing similar words to the keywords

# Eric Lease Morgan
# (c) University of Notre Dame; distributed under a GNU Public License

# August 8, 2022 - first documentation


# configure
CARREL = 'pamphlets'      # name of the study carrel to read
TYPE = 'similarity'       # passed to rdr.word2vec as its type
SIZE = 32                 # passed to rdr.word2vec as topn
LIMIT = 32                # maximum number of keywords used as sources
HEADER = ['source', 'target', 'weight']
STOPWORDS = ['thy']


def build_lexicon(listing, limit=LIMIT):
    """Return at most *limit* keywords parsed from a keyword listing.

    *listing* is newline-delimited text; the first tab-delimited field of
    each line is taken as the keyword (presumably the remaining field is a
    count, since the listing is requested with count=True -- confirm against
    the rdr documentation).

    Blank lines (e.g. the one produced by a trailing newline) are skipped
    so they neither consume the limit nor become empty queries.
    """
    lexicon = []
    for line in listing.split('\n'):
        # test the limit before appending; testing afterwards lets extra
        # keywords slip through
        if len(lexicon) >= limit:
            break
        keyword = line.split('\t')[0]
        if keyword:
            lexicon.append(keyword)
    return lexicon


def is_candidate(word, stopwords=STOPWORDS):
    """Return True when *word* is neither a stopword nor a multi-word phrase."""
    return word not in stopwords and ' ' not in word


def build_edges(source, listing):
    """Return one 'source<TAB>target<TAB>weight' row per record in *listing*.

    *listing* is newline-delimited text whose records hold a target word and
    a weight separated by a tab. Records with fewer than two fields (blank
    lines, stray messages) are skipped instead of raising IndexError.
    """
    edges = []
    for record in listing.split('\n'):
        fields = record.split('\t')
        if len(fields) < 2:
            continue
        edges.append('\t'.join([source, fields[0], fields[1]]))
    return edges


def main():
    """Print a TSV edge list linking each keyword to its similar words."""

    # require; imported here so the parsing helpers above remain usable
    # (and testable) where the reader toolbox is not installed
    import rdr

    # initialize
    network = ['\t'.join(HEADER)]

    # create a lexicon from the given carrel's list of keywords
    lexicon = build_lexicon(rdr.keywords(CARREL, count=True))

    # process each word in the lexicon
    for word in lexicon:

        # skip stopwords and phrases
        if not is_candidate(word):
            continue

        # find and process each similar word to the given word
        similar = rdr.word2vec(CARREL, type=TYPE, topn=SIZE, query=word)
        network.extend(build_edges(word, similar))

    # output and done
    print('\n'.join(network))


if __name__ == '__main__':
    main()