#!/usr/bin/env python

# results2cache.py - given a CSV file and output directory, fill the directory with content for the Reader

# Eric Lease Morgan
# (c) University of Notre Dame; distributed under a GNU Public License

# March 25, 2023 - first investigations; at the cabin


# configure
METADATA = 'metadata.csv'

# require
from pathlib import Path
import multiprocessing
import pandas as pd
import requests
import sys


# cache/download
def cache( directory, url ) :

    # parse; url is really a ( url, filename ) tuple
    filename = url[ 1 ]
    url      = url[ 0 ]

    # debug, request, and debug some more
    sys.stderr.write( 'Getting ' + url + ' ' )
    response = requests.get( url )
    code     = response.status_code
    sys.stderr.write( str( code ) + '\n' )

    # save, conditionally
    if code == 200 :
        with open( directory/filename, 'wb' ) as handle : handle.write( response.content )


# on your mark, get set, go
if __name__ == '__main__' :

    # get input
    if len( sys.argv ) != 3 : sys.exit( "Usage: " + sys.argv[ 0 ] + " <csv> <directory>" )
    csv       = sys.argv[ 1 ]
    directory = sys.argv[ 2 ]

    # make sane
    directory = Path( directory )
    directory.mkdir( exist_ok=True )

    # import results and create a file column; file names are the last component of each url
    results           = pd.read_csv( csv )
    results[ 'file' ] = results[ 'url' ].str.split( '/' ).str[ -1 ]

    # create a list of ( url, file name ) tuples, and then cache them in parallel; tricky
    urls = list( zip( results[ 'url' ].tolist(), results[ 'file' ].tolist() ) )
    pool = multiprocessing.Pool()
    pool.starmap( cache, [ [ directory, url ] for url in urls ] )
    pool.close()
    pool.join()

    # output metadata
    with open( directory/METADATA, 'w' ) as handle : handle.write( results.to_csv( index=False ) )

    # done
    exit()
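
# sample invocation, a sketch only; the file names below are hypothetical, and
# the given CSV file is assumed to contain a column named 'url' whose values
# end in usable file names:
#
#   $ ./results2cache.py ./results.csv ./cache
#
# the script then fills ./cache with the downloaded items plus metadata.csv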