Parallelised updating of Alexa.com site rankings

Script now fetches Alexa ranks for sites concurrently on separate threads. Cuts down the time to sync ranks from approximately **5 minutes** to about **18 seconds**.
This commit is contained in:
Avinash Shenoy 2019-01-27 15:01:55 +05:30 committed by GitHub
parent 269df6d549
commit 1442f333c2
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -1,56 +1,55 @@
"""Sherlock: Supported Site Listing
This module generates the listing of supported sites.
"""
import json
import sys
import requests
from argparse import ArgumentParser, RawDescriptionHelpFormatter
import threading
from bs4 import BeautifulSoup as bs
from datetime import datetime
def get_rank(domain_to_query):
    """Return the Alexa global rank for *domain_to_query*, or -1 if not found.

    NOTE(review): this one-argument definition is shadowed by the
    two-argument ``get_rank`` defined later in the file, so only the
    later one is actually in effect at runtime.
    """
    rank = -1
    page = requests.get("http://www.alexa.com/siteinfo/" + domain_to_query).text
    soup = bs(page, features="lxml")
    # The rank lives in <strong class="metrics-data"> nested inside a
    # <span class="globleRank"> element (sic — Alexa's own typo).
    for span in soup.find_all('span'):
        if "globleRank" in span.get("class", []):
            for strong in span.find_all("strong"):
                if "metrics-data" in strong.get("class", []):
                    rank = int(strong.text.strip().replace(',', ''))
    return rank
# Worker threads (paired with per-site info) spawned while updating ranks.
pool = []

# Command-line interface: a single opt-in flag for the slow rank refresh.
parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
parser.add_argument(
    "--rank", "-r",
    action="store_true",
    dest="rank",
    default=False,
    help="Update all website ranks (not recommended).",
)
args = parser.parse_args()
def get_rank(domain_to_query, dest):
    """Fetch the Alexa global rank for *domain_to_query* and store it.

    Scrapes http://www.alexa.com/siteinfo/<domain> and extracts the value
    from <strong class="metrics-data"> inside <span class="globleRank">.

    BUG FIX: the original body ended with ``dest = result``, which only
    rebinds the local name ``dest`` and never propagates the rank back to
    the caller. *dest* should be the site's mutable entry dict so the
    rank can be written into it in place.

    Args:
        domain_to_query: domain or URL to look up on alexa.com.
        dest: the site's entry dict; its "rank" key is updated in place
            when a dict is passed. Non-dict values are ignored (keeps old
            call sites that passed the plain int from crashing).

    Returns:
        The parsed rank as an int, or -1 when no rank could be found.
    """
    result = -1
    url = "http://www.alexa.com/siteinfo/" + domain_to_query
    page = requests.get(url).text
    soup = bs(page, features="lxml")
    for span in soup.find_all('span'):
        if span.has_attr("class"):
            if "globleRank" in span["class"]:
                for strong in span.find_all("strong"):
                    if strong.has_attr("class"):
                        if "metrics-data" in strong["class"]:
                            result = int(strong.text.strip().replace(',', ''))
    # Write through to the caller's per-site dict; assigning to the bare
    # parameter name would be a no-op outside this function.
    if isinstance(dest, dict):
        dest["rank"] = result
    return result
# ---- Main script -------------------------------------------------------
# Regenerates sites.md from data.json, optionally refreshing every site's
# Alexa rank concurrently (one worker thread per site) when --rank is given.

with open("data.json", "r", encoding="utf-8") as data_file:
    data = json.load(data_file)

with open("sites.md", "w") as site_file:
    data_length = len(data)
    site_file.write(f'## List Of Supported Sites ({data_length} Sites In Total!)\n')

    # Spawn one thread per site so the rank lookups overlap on network I/O.
    # BUG FIX: the original passed data[site]["rank"] (a plain int) as the
    # destination, so every fetched rank was silently discarded; pass the
    # site's entry dict so get_rank can write the rank into it.
    # BUG FIX: threads are only started when --rank was requested; the
    # original hit alexa.com for every site even without the flag.
    for social_network in data:
        url_main = data[social_network]["urlMain"]
        th = threading.Thread(target=get_rank,
                              args=(url_main, data[social_network]))
        # BUG FIX: record the site name *and* its URL with the thread; the
        # original stored only the URL, unpacked it as the site name, and
        # wrote the stale loop variable `url_main` into every line.
        pool.append((social_network, url_main, th))
        if args.rank:
            th.start()

    index = 1
    for social_network, url_main, th in pool:
        if args.rank:
            th.join()
        site_file.write(f'{index}. [{social_network}]({url_main})\n')
        sys.stdout.write("\r{0}".format(f"Updated {index} out of {data_length} entries"))
        sys.stdout.flush()
        index = index + 1

    site_file.write(f'\nAlexa.com rank data fetched at ({datetime.utcnow()} UTC)\n')

# Persist the (possibly rank-updated) data, key-sorted for stable diffs.
sorted_json_data = json.dumps(data, indent=2, sort_keys=True)
with open("data.json", "w") as data_file:
    data_file.write(sorted_json_data)

print("\nFinished updating supported site listing!")