Skip to content
Permalink
master
Go to file
 
 
Cannot retrieve contributors at this time
executable file 77 lines (64 sloc) 1.99 KB
#!/usr/bin/env python3
import re
import csv
import twarc
import rtyaml
import requests
import requests_html
# Shared HTML session; check_url() uses it to fetch and render profile pages.
http = requests_html.HTMLSession()
# Shared Twarc client; get_new_url() uses it to look up users by id.
# NOTE(review): constructed with no arguments, so credentials presumably come
# from twarc's own configuration/environment — confirm before running.
twitter = twarc.Twarc()
def _left_after_116th(terms):
    """Return True if a term ends on 2021-01-03 and none starts on that date."""
    was_116 = any(term['end'] == '2021-01-03' for term in terms)
    is_117 = any(term['start'] == '2021-01-03' for term in terms)
    return was_116 and not is_117


def main():
    """Write outgoing.csv with Twitter details of legislators who left in 2021.

    Reads ``../legislators.yaml``, keeps every entry that has a ``social``
    section, had a term ending on 2021-01-03 and has no term starting on that
    date, and writes one CSV row per such legislator with the columns
    ``name``, ``url``, ``url_ok``, ``user_id`` and ``new_url``.  Columns for
    which no value was collected are left empty.
    """
    # Context managers make sure both files are closed (and the CSV flushed)
    # even if one of the network checks below raises.
    with open('../legislators.yaml') as source:
        legis = rtyaml.load(source)

    # newline='' is what the csv module asks for so that it controls line
    # endings itself.
    with open('outgoing.csv', 'w', newline='') as sink:
        out = csv.DictWriter(sink, fieldnames=[
            "name",
            "url",
            "url_ok",
            "user_id",
            "new_url",
        ])
        out.writeheader()

        for p in legis:
            if 'social' not in p:
                continue

            # only legislators who were in the 116th but not the 117th congress
            if not _left_after_116th(p['terms']):
                continue

            social = p['social']
            row = {'name': p['name']['official_full']}

            if 'twitter' in social:
                username = list(social['twitter'].keys())[0]
                row['url'] = 'https://twitter.com/{}'.format(username)
                row['url_ok'] = check_url(row['url'])

            if 'twitter_id' in social:
                row['user_id'] = list(social['twitter_id'].keys())[0]
                # Only search for a replacement URL when a handle was actually
                # checked and found dead.  .get() avoids the KeyError that
                # occurred when an entry had a twitter_id but no twitter handle.
                if row.get('url_ok') is False:
                    row['new_url'] = get_new_url(row['user_id'])

            out.writerow(row)
def check_url(url):
    """Return True if *url* loads as an existing account page, else False.

    The page is fetched with the shared ``http`` session and rendered (with a
    10 second sleep) before its text is inspected.  The result is False when
    the response is a redirect, when the status code is not 200, or when the
    rendered text contains the "This account doesn’t exist" notice.
    """
    response = http.get(url)
    response.html.render(sleep=10)

    if response.is_redirect:
        return False
    if response.status_code != 200:
        return False

    missing_notice = re.search(r'This account doesn’t exist', response.html.text)
    return not missing_notice
def get_new_url(id):
    """Return the current twitter.com URL for the account with the given id.

    *id* is converted to ``str`` and looked up through the shared ``twitter``
    client.  Returns ``None`` when the lookup yields no user, yields a falsy
    user, or fails with ``requests.exceptions.HTTPError``.
    """
    user_id = str(id)
    try:
        # The default keeps an empty lookup result from escaping as an
        # uncaught StopIteration.
        user = next(twitter.user_lookup([user_id]), None)
    except requests.exceptions.HTTPError:
        # Best effort: a failed lookup just means no replacement URL is known.
        return None
    if not user:
        return None
    return 'https://twitter.com/' + user['screen_name']
# Run the export only when this file is executed as a script.
if __name__ == "__main__":
    main()