mirror of
https://github.com/sherlock-project/sherlock
synced 2024-11-25 21:30:24 +00:00
bd70be289b
Not sure why the API was not used earlier. It is much more efficient as much less data is passed. And, it is less likely to have its format changed randomly. It did flake out on some sites one time I used it. But, then it worked reliably next. I am not going to do many requests, as it probably has some query rate limit.
74 lines
2.4 KiB
Python
74 lines
2.4 KiB
Python
"""Sherlock: Supported Site Listing
|
|
This module generates the listing of supported sites.
|
|
"""
|
|
import json
|
|
import sys
|
|
import requests
|
|
import threading
|
|
import xml.etree.ElementTree as ET
|
|
from datetime import datetime
|
|
from argparse import ArgumentParser, RawDescriptionHelpFormatter
|
|
|
|
pool = list()
|
|
|
|
def get_rank(domain_to_query, dest):
|
|
result = -1
|
|
|
|
#Retrieve ranking data via alexa API
|
|
url = f"http://data.alexa.com/data?cli=10&url={domain_to_query}"
|
|
xml_data = requests.get(url).text
|
|
root = ET.fromstring(xml_data)
|
|
try:
|
|
#Get ranking for this site.
|
|
dest['rank'] = int(root.find(".//REACH").attrib["RANK"])
|
|
except:
|
|
#We did not find the rank for some reason.
|
|
print(f"Error retrieving rank information for '{domain_to_query}'")
|
|
print(f" Returned XML is |{xml_data}|")
|
|
|
|
return
|
|
|
|
parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter
|
|
)
|
|
parser.add_argument("--rank","-r",
|
|
action="store_true", dest="rank", default=False,
|
|
help="Update all website ranks (not recommended)."
|
|
)
|
|
args = parser.parse_args()
|
|
|
|
with open("data.json", "r", encoding="utf-8") as data_file:
|
|
data = json.load(data_file)
|
|
|
|
with open("sites.md", "w") as site_file:
|
|
data_length = len(data)
|
|
site_file.write(f'## List Of Supported Sites ({data_length} Sites In Total!)\n')
|
|
|
|
for social_network in data:
|
|
url_main = data.get(social_network).get("urlMain")
|
|
data.get(social_network)["rank"] = 0
|
|
if args.rank:
|
|
th = threading.Thread(target=get_rank, args=(url_main, data.get(social_network)))
|
|
else:
|
|
th = None
|
|
pool.append((social_network, url_main, th))
|
|
if args.rank:
|
|
th.start()
|
|
|
|
index = 1
|
|
for social_network, url_main, th in pool:
|
|
if args.rank:
|
|
th.join()
|
|
site_file.write(f'{index}. [{social_network}]({url_main})\n')
|
|
sys.stdout.write("\r{0}".format(f"Updated {index} out of {data_length} entries"))
|
|
sys.stdout.flush()
|
|
index = index + 1
|
|
|
|
if args.rank:
|
|
site_file.write(f'\nAlexa.com rank data fetched at ({datetime.utcnow()} UTC)\n')
|
|
|
|
sorted_json_data = json.dumps(data, indent=2, sort_keys=True)
|
|
|
|
with open("data.json", "w") as data_file:
|
|
data_file.write(sorted_json_data)
|
|
|
|
print("\nFinished updating supported site listing!")
|