#! /usr/bin/env python3
"""
Sherlock: Find Usernames Across Social Networks Module
This module contains the main logic to search for usernames at social
networks.
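
A typical invocation from the command line (all options are defined in main()
below; "user123" is only a placeholder username, and the command assumes it is
run from the directory containing this script):

    $ python3 sherlock.py user123 --csv --timeout 10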
"""
import csv
import json
import os
import platform
import re
import sys
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from time import monotonic
from concurrent.futures import ThreadPoolExecutor
from time import time
import webbrowser
import requests
from colorama import Fore, Style, init
from requests_futures.sessions import FuturesSession
from torrequest import TorRequest
from result import QueryStatus
from result import QueryResult
from sites import SitesInformation
module_name = "Sherlock: Find Usernames Across Social Networks"
__version__ = "0.11.0"
class SherlockFuturesSession(FuturesSession):
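"""A FuturesSession that records a response-time metric for every request."""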
def request(self, method, url, hooks=None, *args, **kwargs):
"""Request URL.
This extends the FuturesSession request method to calculate a response
time metric to each request.
It is taken (almost) directly from the following example in the
requests-futures documentation:
https://github.com/ross/requests-futures#working-in-the-background
Keyword Arguments:
self -- This object.
method -- String containing method desired for request.
url -- String containing URL for request.
hooks -- Dictionary containing hooks to execute after
request finishes.
args -- Arguments.
kwargs -- Keyword arguments.
Return Value:
Request object.
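
Illustrative use (a minimal sketch; the URL is only a placeholder):
    session = SherlockFuturesSession(max_workers=5)
    resp = session.get("https://example.com").result()
    print(resp.elapsed)  # Float seconds set by the response_time hook below.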
"""
#Avoid mutating a shared default hooks dictionary between calls.
if hooks is None:
hooks = {}
#Record the start time for the request.
start = monotonic()
def response_time(resp, *args, **kwargs):
"""Response Time Hook.
Keyword Arguments:
resp -- Response object.
args -- Arguments.
kwargs -- Keyword arguments.
Return Value:
N/A
"""
resp.elapsed = monotonic() - start
return
#Install hook to execute when response completes.
#Make sure that the time measurement hook is first, so we will not
#track any later hook's execution time.
try:
if isinstance(hooks['response'], list):
hooks['response'].insert(0, response_time)
elif isinstance(hooks['response'], tuple):
#Convert tuple to list and insert time measurement hook first.
hooks['response'] = list(hooks['response'])
hooks['response'].insert(0, response_time)
else:
#Must have previously contained a single hook function,
#so convert to list.
hooks['response'] = [response_time, hooks['response']]
except KeyError:
#No response hook was already defined, so install it ourselves.
hooks['response'] = [response_time]
return super(SherlockFuturesSession, self).request(method,
url,
hooks=hooks,
*args, **kwargs)
def print_info(title, info, color=True):
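"""Print the header line announcing which username is being checked."""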
if color:
print(Style.BRIGHT + Fore.GREEN + "[" +
Fore.YELLOW + "*" +
Fore.GREEN + f"] {title}" +
Fore.WHITE + f" {info}" +
Fore.GREEN + " on:")
else:
print(f"[*] {title} {info} on:")
def print_error(social_network, err, errstr, var, verbose=False, color=True):
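"""Print an error line for a site query, showing the exception text when verbose."""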
if color:
print(Style.BRIGHT + Fore.WHITE + "[" +
Fore.RED + "-" +
Fore.WHITE + "]" +
Fore.GREEN + f" {social_network}:" +
Fore.RED + f" {errstr}" +
Fore.YELLOW + f" {err if verbose else var}")
else:
print(f"[-] {social_network}: {errstr} {err if verbose else var}")
def format_response_time(response_time, verbose):
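"""Return the response time formatted for display (empty string unless verbose)."""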
return f" [{round(response_time * 1000)} ms]" if verbose else ""
def print_found(social_network, url, response_time, verbose=False, color=True):
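"""Print the result line for a username that was found on a site."""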
if color:
print((Style.BRIGHT + Fore.WHITE + "[" +
Fore.GREEN + "+" +
Fore.WHITE + "]" +
format_response_time(response_time, verbose) +
Fore.GREEN + f" {social_network}:"), url)
else:
print(f"[+]{format_response_time(response_time, verbose)} {social_network}: {url}")
def print_not_found(social_network, response_time, verbose=False, color=True):
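"""Print the result line for a username that was not found on a site."""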
if color:
print((Style.BRIGHT + Fore.WHITE + "[" +
Fore.RED + "-" +
Fore.WHITE + "]" +
format_response_time(response_time, verbose) +
Fore.GREEN + f" {social_network}:" +
Fore.YELLOW + " Not Found!"))
else:
print(f"[-]{format_response_time(response_time, verbose)} {social_network}: Not Found!")
def print_invalid(social_network, msg, color=True):
"""Print invalid search result."""
if color:
print((Style.BRIGHT + Fore.WHITE + "[" +
Fore.RED + "-" +
Fore.WHITE + "]" +
Fore.GREEN + f" {social_network}:" +
Fore.YELLOW + f" {msg}"))
else:
print(f"[-] {social_network}: {msg}")
def get_response(request_future, error_type, social_network, verbose=False, color=True):
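"""Resolve a request future and report any error that occurred.
Return Value:
Tuple of (response, error_context, exception_text). On success,
error_context and exception_text are None; on failure, response may be None.
"""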
#Default for Response object if some failure occurs.
response = None
error_context = "General Unknown Error"
exception_text = None
try:
response = request_future.result()
if response.status_code:
#Status code exists in response object.
error_context = None
except requests.exceptions.HTTPError as errh:
error_context = "HTTP Error"
exception_text = str(errh)
except requests.exceptions.ProxyError as errp:
error_context = "Proxy Error"
exception_text = str(errp)
except requests.exceptions.ConnectionError as errc:
error_context = "Error Connecting"
exception_text = str(errc)
except requests.exceptions.Timeout as errt:
error_context = "Timeout Error"
exception_text = str(errt)
except requests.exceptions.RequestException as err:
error_context = "Unknown Error"
exception_text = str(err)
return response, error_context, exception_text
def sherlock(username, site_data, verbose=False, tor=False, unique_tor=False,
proxy=None, print_found_only=False, timeout=None, color=True,
print_output=True):
"""Run Sherlock Analysis.
Checks for existence of username on various social media sites.
Keyword Arguments:
username -- String indicating username that report
should be created against.
site_data -- Dictionary containing all of the site data.
verbose -- Boolean indicating whether to give verbose output.
tor -- Boolean indicating whether to use a tor circuit for the requests.
unique_tor -- Boolean indicating whether to use a new tor circuit for each request.
proxy -- String indicating the proxy URL
print_found_only -- Boolean indicating whether to only print found sites.
timeout -- Time in seconds to wait before timing out request.
Default is no timeout.
color -- Boolean indicating whether to color terminal output
print_output -- Boolean indicating whether the output should be
printed. Default is True.
Return Value:
Dictionary containing results from report. Key of dictionary is the name
of the social network site, and the value is another dictionary with
the following keys:
url_main: URL of main site.
url_user: URL of user on site (if account exists).
status: QueryResult() object indicating results of test for
account existence.
http_status: HTTP status code of query which checked for existence on
site.
response_text: Text that came back from request. May be None if
there was an HTTP error when checking for existence.
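
Illustrative use (a sketch; "GitHub" is only an example key and is present
only when that site is part of site_data):
    results = sherlock("user123", site_data, print_output=False)
    entry = results["GitHub"]
    if entry["status"].status == QueryStatus.CLAIMED:
        print(entry["url_user"])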
"""
if print_output:
print_info("Checking username", username, color)
# Create session based on request methodology
if tor or unique_tor:
#Requests using Tor obfuscation
underlying_request = TorRequest()
underlying_session = underlying_request.session
else:
#Normal requests
underlying_session = requests.session()
underlying_request = requests.Request()
#Limit number of workers to 20.
#This is probably vastly overkill.
if len(site_data) >= 20:
max_workers=20
else:
max_workers=len(site_data)
#Create multi-threaded session for all requests.
session = SherlockFuturesSession(max_workers=max_workers,
session=underlying_session)
# Results from analysis of all sites
results_total = {}
# First create futures for all requests. This allows for the requests to run in parallel
for social_network, net_info in site_data.items():
# Results from analysis of this specific site
results_site = {}
# Record URL of main site
results_site['url_main'] = net_info.get("urlMain")
# A user agent is needed because some sites don't return the correct
# information since they think that we are bots (Which we actually are...)
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',
}
if "headers" in net_info:
# Override/append any extra headers required by a given site.
headers.update(net_info["headers"])
# Don't make request if username is invalid for the site
regex_check = net_info.get("regexCheck")
if regex_check and re.search(regex_check, username) is None:
# No need to do the check at the site: this user name is not allowed.
if print_output and not print_found_only:
print_invalid(social_network, "Illegal Username Format For This Site!", color)
results_site['status'] = QueryResult(QueryStatus.ILLEGAL)
results_site["url_user"] = ""
results_site['http_status'] = ""
results_site['response_text'] = ""
results_site['response_time_s'] = ""
else:
# URL of user on site (if it exists)
url = net_info["url"].format(username)
results_site["url_user"] = url
url_probe = net_info.get("urlProbe")
if url_probe is None:
# Probe URL is normal one seen by people out on the web.
url_probe = url
else:
# There is a special URL for probing existence separate
# from where the user profile normally can be found.
url_probe = url_probe.format(username)
#If only the status_code is needed don't download the body
if net_info["errorType"] == 'status_code':
request_method = session.head
else:
request_method = session.get
if net_info["errorType"] == "response_url":
# Site forwards request to a different URL if username not
# found. Disallow the redirect so we can capture the
# http status from the original URL request.
allow_redirects = False
else:
# Allow whatever redirect that the site wants to do.
# The final result of the request will be what is available.
allow_redirects = True
# This future starts running the request in a new thread, doesn't block the main thread
if proxy is not None:
proxies = {"http": proxy, "https": proxy}
future = request_method(url=url_probe, headers=headers,
proxies=proxies,
allow_redirects=allow_redirects,
timeout=timeout
)
else:
future = request_method(url=url_probe, headers=headers,
allow_redirects=allow_redirects,
timeout=timeout
)
# Store future in data for access later
net_info["request_future"] = future
# Reset identity for Tor (if needed)
if unique_tor:
underlying_request.reset_identity()
# Add this site's results into final dictionary with all of the other results.
results_total[social_network] = results_site
# Core logic: the requests were all started above as futures; now wait
# for each response and analyze it.
for social_network, net_info in site_data.items():
# Retrieve results again
results_site = results_total.get(social_network)
# Retrieve other site information again
url = results_site.get("url_user")
status = results_site.get("status")
if status is not None:
# We have already determined the user doesn't exist here
continue
# Get the expected error type
error_type = net_info["errorType"]
# Retrieve future and ensure it has finished
future = net_info["request_future"]
r, error_text, exception_text = get_response(request_future=future,
error_type=error_type,
social_network=social_network,
verbose=verbose,
color=color)
#Get response time for response of our request.
try:
response_time = r.elapsed
except AttributeError:
response_time = None
# Attempt to get request information
try:
http_status = r.status_code
except AttributeError:
#No response object is available (e.g. the request failed outright).
http_status = "?"
try:
response_text = r.text.encode(r.encoding)
except Exception:
#Missing response, or missing/invalid encoding information.
response_text = ""
if error_text is not None:
result = QueryResult(QueryStatus.UNKNOWN, error_text)
elif error_type == "message":
error = net_info.get("errorMsg")
# Checks if the error message is in the HTML
if error not in r.text:
result = QueryResult(QueryStatus.CLAIMED)
else:
result = QueryResult(QueryStatus.AVAILABLE)
elif error_type == "status_code":
# Checks if the status code of the response is 2XX
if 200 <= r.status_code < 300:
result = QueryResult(QueryStatus.CLAIMED)
else:
result = QueryResult(QueryStatus.AVAILABLE)
elif error_type == "response_url":
# For this detection method, we have turned off the redirect.
# So, there is no need to check the response URL: it will always
# match the request. Instead, we will ensure that the response
# code indicates that the request was successful (i.e. no 404, or
# forward to some odd redirect).
if 200 <= r.status_code < 300:
result = QueryResult(QueryStatus.CLAIMED)
else:
result = QueryResult(QueryStatus.AVAILABLE)
else:
#It should be impossible to ever get here...
raise ValueError(f"Unknown Error Type '{error_type}' for "
f"site '{social_network}'")
if print_output:
#Output to the terminal is desired.
if result.status == QueryStatus.CLAIMED:
print_found(social_network, url, response_time, verbose, color)
elif result.status == QueryStatus.AVAILABLE:
if not print_found_only:
print_not_found(social_network, response_time, verbose, color)
elif result.status == QueryStatus.UNKNOWN:
print_error(social_network, exception_text, error_text, "", verbose, color)
elif result.status == QueryStatus.ILLEGAL:
if not print_found_only:
print_invalid(social_network, "Illegal Username Format For This Site!", color)
else:
#It should be impossible to ever get here...
raise ValueError(f"Unknown Query Status '{str(result.status)}' for "
f"site '{social_network}'")
# Save status of request
results_site['status'] = result
# Save results from request
results_site['http_status'] = http_status
results_site['response_text'] = response_text
results_site['response_time_s'] = response_time
# Add this site's results into final dictionary with all of the other results.
results_total[social_network] = results_site
return results_total
def timeout_check(value):
"""Check Timeout Argument.
Checks timeout for validity.
Keyword Arguments:
value -- Time in seconds to wait before timing out request.
Return Value:
Floating point number representing the time (in seconds) that should be
used for the timeout.
NOTE: Will raise an exception if the timeout is invalid.
"""
from argparse import ArgumentTypeError
try:
timeout = float(value)
except ValueError:
raise ArgumentTypeError(f"Timeout '{value}' must be a number.")
if timeout <= 0:
raise ArgumentTypeError(f"Timeout '{value}' must be greater than 0.0s.")
return timeout
def main():
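"""Parse the command line arguments and run the Sherlock report for each username."""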
# Colorama module's initialization.
init(autoreset=True)
version_string = f"%(prog)s {__version__}\n" + \
f"{requests.__description__}: {requests.__version__}\n" + \
f"Python: {platform.python_version()}"
parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter,
description=f"{module_name} (Version {__version__})"
)
parser.add_argument("--version",
action="version", version=version_string,
help="Display version information and dependencies."
)
parser.add_argument("--verbose", "-v", "-d", "--debug",
action="store_true", dest="verbose", default=False,
help="Display extra debugging information and metrics."
)
parser.add_argument("--rank", "-r",
action="store_true", dest="rank", default=False,
help="Present websites ordered by their Alexa.com global rank in popularity.")
parser.add_argument("--folderoutput", "-fo", dest="folderoutput",
help="If using multiple usernames, the output of the results will be saved to this folder."
)
parser.add_argument("--output", "-o", dest="output",
help="If using a single username, the output of the result will be saved to this file."
)
parser.add_argument("--tor", "-t",
action="store_true", dest="tor", default=False,
help="Make requests over Tor; increases runtime; requires Tor to be installed and in system path.")
parser.add_argument("--unique-tor", "-u",
action="store_true", dest="unique_tor", default=False,
help="Make requests over Tor with new Tor circuit after each request; increases runtime; requires Tor to be installed and in system path.")
parser.add_argument("--csv",
action="store_true", dest="csv", default=False,
help="Create Comma-Separated Values (CSV) File."
)
parser.add_argument("--site",
action="append", metavar='SITE_NAME',
dest="site_list", default=None,
help="Limit analysis to just the listed sites. Add multiple options to specify more than one site."
)
parser.add_argument("--proxy", "-p", metavar='PROXY_URL',
action="store", dest="proxy", default=None,
help="Make requests over a proxy. e.g. socks5://127.0.0.1:1080"
)
parser.add_argument("--json", "-j", metavar="JSON_FILE",
dest="json_file", default=None,
help="Load data from a local JSON file or from a URL pointing to a valid JSON file.")
parser.add_argument("--timeout",
action="store", metavar='TIMEOUT',
dest="timeout", type=timeout_check, default=None,
help="Time (in seconds) to wait for response to requests. "
"Default is no timeout. "
"A longer timeout will be more likely to get results from slow sites. "
"On the other hand, this may cause a long delay to gather all results."
)
parser.add_argument("--print-found",
action="store_true", dest="print_found_only", default=False,
help="Do not output sites where the username was not found."
)
parser.add_argument("--no-color",
action="store_true", dest="no_color", default=False,
help="Don't color terminal output"
)
parser.add_argument("username",
nargs='+', metavar='USERNAMES',
action="store",
help="One or more usernames to check with social networks."
)
parser.add_argument("--browse", "-b",
action="store_true", dest="browse", default=False,
help="Browse to all results in the default browser.")
args = parser.parse_args()
# Argument check
# TODO regex check on args.proxy
if args.tor and (args.proxy is not None):
raise Exception("Tor and Proxy cannot be set at the same time.")
# Make prompts
if args.proxy is not None:
print("Using the proxy: " + args.proxy)
if args.tor or args.unique_tor:
print("Using Tor to make requests")
print("Warning: some websites might refuse connections made over Tor, so using this option may increase connection errors.")
# Check if both output methods are entered as input.
if args.output is not None and args.folderoutput is not None:
print("You can only use one of the output methods.")
sys.exit(1)
# Check validity for single username output.
if args.output is not None and len(args.username) != 1:
print("You can only use --output with a single username")
sys.exit(1)
#Create object with all information about sites we are aware of.
try:
sites = SitesInformation(args.json_file)
except Exception as error:
print(f"ERROR: {error}")
sys.exit(1)
#Create original dictionary from SitesInformation() object.
#Eventually, the rest of the code will be updated to use the new object
#directly, but this will glue the two pieces together.
site_data_all = {}
for site in sites:
site_data_all[site.name] = site.information
if args.site_list is None:
# Not desired to look at a sub-set of sites
site_data = site_data_all
else:
# User desires to selectively run queries on a sub-set of the site list.
# Make sure that the sites are supported & build up pruned site database.
site_data = {}
site_missing = []
for site in args.site_list:
site_found = False
for existing_site in site_data_all:
if site.lower() == existing_site.lower():
site_data[existing_site] = site_data_all[existing_site]
site_found = True
if not site_found:
# Build up list of sites not supported for future error message.
site_missing.append(f"'{site}'")
if site_missing:
print(
f"Error: Desired sites not found: {', '.join(site_missing)}.")
sys.exit(1)
if args.rank:
# Sort data by rank
site_dataCpy = dict(site_data)
ranked_sites = sorted(site_data, key=lambda k: ("rank" not in site_data[k], site_data[k].get("rank", sys.maxsize)))
site_data = {}
for site in ranked_sites:
site_data[site] = site_dataCpy.get(site)
# Run report on all specified users.
for username in args.username:
print()
results = sherlock(username,
site_data,
verbose=args.verbose,
tor=args.tor,
unique_tor=args.unique_tor,
proxy=args.proxy,
print_found_only=args.print_found_only,
timeout=args.timeout,
color=not args.no_color)
if args.output:
result_file = args.output
elif args.folderoutput:
# The usernames results should be stored in a targeted folder.
# If the folder doesn't exist, create it first
os.makedirs(args.folderoutput, exist_ok=True)
result_file = os.path.join(args.folderoutput, f"{username}.txt")
else:
result_file = f"{username}.txt"
with open(result_file, "w", encoding="utf-8") as file:
exists_counter = 0
for website_name in results:
dictionary = results[website_name]
if dictionary.get("status").status == QueryStatus.CLAIMED:
exists_counter += 1
file.write(dictionary["url_user"] + "\n")
file.write(f"Total Websites Username Detected On : {exists_counter}")
if args.csv:
with open(username + ".csv", "w", newline='', encoding="utf-8") as csv_report:
writer = csv.writer(csv_report)
writer.writerow(['username',
'name',
'url_main',
'url_user',
'exists',
'http_status',
'response_time_s'
]
)
for site in results:
writer.writerow([username,
site,
results[site]['url_main'],
results[site]['url_user'],
str(results[site]['status'].status),
results[site]['http_status'],
results[site]['response_time_s']
]
)
if __name__ == "__main__":
main()