#!/usr/bin/env python

# results2cache.py - given a CSV file and output directory, fill the directory with content for the Reader

# Eric Lease Morgan
# (c) University of Notre Dame; distributed under a GNU Public License

# March 25, 2023 - first investigations; at the cabin


# configure
METADATA = 'metadata.csv'

# require
from pathlib import Path
import multiprocessing
import pandas as pd
import requests
import sys


# cache/download
def cache( directory, url ) :

    # parse; url is really a ( url, filename ) tuple
    filename = url[ 1 ]
    url      = url[ 0 ]

    # debug, request, and debug some more
    sys.stderr.write( 'Getting ' + url + ' ' )
    response = requests.get( url )
    code     = response.status_code
    sys.stderr.write( str( code ) + '\n' )

    # save, conditionally
    if code == 200 :
        with open( directory/filename, 'wb' ) as handle : handle.write( response.content )


# on your mark, get set, go
if __name__ == '__main__' :

    # get input
    if len( sys.argv ) != 3 : sys.exit( "Usage: " + sys.argv[ 0 ] + " <csv> <directory>" )
    csv       = sys.argv[ 1 ]
    directory = sys.argv[ 2 ]

    # make sane
    directory = Path( directory )
    directory.mkdir( exist_ok=True )

    # import results and create a file column; file names are the last component of each url
    results           = pd.read_csv( csv )
    results[ 'file' ] = results[ 'url' ].str.split( '/' ).str[ -1 ]

    # create a list of ( url, file name ) tuples, and then cache them in parallel; tricky
    urls = list( zip( results[ 'url' ].tolist(), results[ 'file' ].tolist() ) )
    pool = multiprocessing.Pool()
    pool.starmap( cache, [ [ directory, url ] for url in urls ] )
    pool.close()
    pool.join()

    # output metadata
    with open( directory/METADATA, 'w' ) as handle : handle.write( results.to_csv( index=False ) )

    # done
    exit()
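
# sample invocation, a sketch only; the file names below are hypothetical, and
# the given CSV file is assumed to contain a column named 'url' whose values
# end in usable file names:
#
#   $ ./results2cache.py ./results.csv ./cache
#
# the script then fills ./cache with the downloaded items plus metadata.csv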