"""Sherlock Sites Information Module

This module supports storing information about websites.
This is the raw data that will be used to search for usernames.
"""
import json
import secrets
from typing import Optional

import requests


class SiteInformation:
    def __init__(self, name, url_home, url_username_format, username_claimed,
                 information, is_nsfw, username_unclaimed=secrets.token_urlsafe(10)):
"""Create Site Information Object.
|
|
|
|
|
2022-01-31 10:06:29 +00:00
|
|
|
Contains information about a specific website.
|
2019-12-29 06:50:06 +00:00
|
|
|
|
|
|
|
Keyword Arguments:
|
|
|
|
self -- This object.
|
|
|
|
name -- String which identifies site.
|
|
|
|
url_home -- String containing URL for home of site.
|
|
|
|
url_username_format -- String containing URL for Username format
|
|
|
|
on site.
|
|
|
|
NOTE: The string should contain the
|
|
|
|
token "{}" where the username should
|
|
|
|
be substituted. For example, a string
|
|
|
|
of "https://somesite.com/users/{}"
|
|
|
|
indicates that the individual
|
|
|
|
usernames would show up under the
|
|
|
|
"https://somesite.com/users/" area of
|
2022-01-31 10:06:29 +00:00
|
|
|
the website.
|
2019-12-29 06:50:06 +00:00
|
|
|
username_claimed -- String containing username which is known
|
2022-01-31 10:06:29 +00:00
|
|
|
to be claimed on website.
|
2019-12-29 06:50:06 +00:00
|
|
|
username_unclaimed -- String containing username which is known
|
2022-01-31 10:06:29 +00:00
|
|
|
to be unclaimed on website.
|
2019-12-29 06:50:06 +00:00
|
|
|
information -- Dictionary containing all known information
|
2022-01-31 10:06:29 +00:00
|
|
|
about website.
|
2019-12-29 06:50:06 +00:00
|
|
|
NOTE: Custom information about how to
|
|
|
|
actually detect the existence of the
|
|
|
|
username will be included in this
|
|
|
|
dictionary. This information will
|
|
|
|
be needed by the detection method,
|
|
|
|
but it is only recorded in this
|
|
|
|
object for future use.
|
2022-11-23 00:10:23 +00:00
|
|
|
is_nsfw -- Boolean indicating if site is Not Safe For Work.
|
2019-12-29 06:50:06 +00:00
|
|
|
|
|
|
|
Return Value:
|
|
|
|
Nothing.
|
|
|
|
"""

        self.name = name
        self.url_home = url_home
        self.url_username_format = url_username_format

        self.username_claimed = username_claimed
        # NOTE: The username_unclaimed argument is currently ignored; a fresh
        # random token is generated for each site, since the default argument
        # in the signature above is only evaluated once at import time.
        self.username_unclaimed = secrets.token_urlsafe(32)
        self.information = information
        self.is_nsfw = is_nsfw

        return

    def __str__(self):
        """Convert Object To String.

        Keyword Arguments:
        self                   -- This object.

        Return Value:
        Nicely formatted string to get information about this object.
        """

        return f"{self.name} ({self.url_home})"


class SitesInformation:
    def __init__(self, data_file_path=None):
"""Create Sites Information Object.
|
|
|
|
|
2022-01-31 10:06:29 +00:00
|
|
|
Contains information about all supported websites.
|
2019-12-29 06:50:06 +00:00
|
|
|
|
|
|
|
Keyword Arguments:
|
|
|
|
self -- This object.
|
|
|
|
data_file_path -- String which indicates path to data file.
|
|
|
|
The file name must end in ".json".
|
|
|
|
|
|
|
|
There are 3 possible formats:
|
|
|
|
* Absolute File Format
|
|
|
|
For example, "c:/stuff/data.json".
|
|
|
|
* Relative File Format
|
|
|
|
The current working directory is used
|
|
|
|
as the context.
|
|
|
|
For example, "data.json".
|
|
|
|
* URL Format
|
|
|
|
For example,
|
|
|
|
"https://example.com/data.json", or
|
|
|
|
"http://example.com/data.json".
|
|
|
|
|
|
|
|
An exception will be thrown if the path
|
|
|
|
to the data file is not in the expected
|
|
|
|
format, or if there was any problem loading
|
|
|
|
the file.
|
|
|
|
|
|
|
|
If this option is not specified, then a
|
|
|
|
default site list will be used.
|
|
|
|
|
|
|
|
Return Value:
|
|
|
|
Nothing.
|
|
|
|
"""

        if not data_file_path:
            # The default data file is the live data.json in the GitHub repo.
            # It is used instead of the local copy so that the user has the
            # most up-to-date data. This prevents users from creating issues
            # about false positives that have already been fixed, and from
            # searching with outdated data.
            data_file_path = "https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock_project/resources/data.json"

        # Ensure that the specified data file has the correct extension.
        if not data_file_path.lower().endswith(".json"):
            raise FileNotFoundError(f"Incorrect JSON file extension for data file '{data_file_path}'.")

        if data_file_path.lower().startswith("http"):
            # Reference is to a URL.
            try:
                response = requests.get(url=data_file_path)
            except Exception as error:
                raise FileNotFoundError(
                    f"Problem while attempting to access data file URL '{data_file_path}': {error}"
                )

            if response.status_code != 200:
                raise FileNotFoundError(
                    f"Bad response while accessing data file URL '{data_file_path}'."
                )

            try:
                site_data = response.json()
            except Exception as error:
                raise ValueError(
                    f"Problem parsing json contents at '{data_file_path}': {error}."
                )
        else:
            # Reference is to a file.
            try:
                with open(data_file_path, "r", encoding="utf-8") as file:
                    try:
                        site_data = json.load(file)
                    except Exception as error:
                        raise ValueError(
                            f"Problem parsing json contents at '{data_file_path}': {error}."
                        )
            except FileNotFoundError:
                raise FileNotFoundError(
                    f"Problem while attempting to access data file '{data_file_path}'."
                )

        # Remove the schema definition key, if present, so that it is not
        # treated as a site entry.
        site_data.pop('$schema', None)

        self.sites = {}

        # Add all site information from the json file to internal site list.
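        # Illustrative shape of one entry (field names as consumed below;
        # values are made up):
        #   "SomeSite": {
        #       "urlMain": "https://somesite.com/",
        #       "url": "https://somesite.com/users/{}",
        #       "username_claimed": "blue",
        #       "isNSFW": false
        #   }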
        for site_name in site_data:
            try:
                self.sites[site_name] = \
                    SiteInformation(site_name,
                                    site_data[site_name]["urlMain"],
                                    site_data[site_name]["url"],
                                    site_data[site_name]["username_claimed"],
                                    site_data[site_name],
                                    site_data[site_name].get("isNSFW", False)
                                    )
            except KeyError as error:
                raise ValueError(
                    f"Problem parsing json contents at '{data_file_path}': Missing attribute {error}."
                )
            except TypeError:
                # Malformed entry (e.g. a non-dictionary value); skip it
                # rather than aborting the whole load.
                print(f"Encountered TypeError parsing json contents for target '{site_name}' at {data_file_path}\nSkipping target.\n")

        return

    def remove_nsfw_sites(self, do_not_remove: Optional[list] = None):
        """
        Remove NSFW sites from the sites list, if the isNSFW flag is true for a site.

        Keyword Arguments:
        self                   -- This object.
        do_not_remove          -- Optional list of site names to keep even if
                                  they are flagged NSFW (matched
                                  case-insensitively).

        Return Value:
        None
        """
        sites = {}
        do_not_remove = [site.casefold() for site in (do_not_remove or [])]
        for site in self.sites:
            if self.sites[site].is_nsfw and site.casefold() not in do_not_remove:
                continue
            sites[site] = self.sites[site]
        self.sites = sites

    def site_name_list(self):
        """Get Site Name List.

        Keyword Arguments:
        self                   -- This object.

        Return Value:
        List of strings containing the names of the sites.
        """

        return sorted([site.name for site in self], key=str.lower)

    def __iter__(self):
        """Iterator For Object.

        Keyword Arguments:
        self                   -- This object.

        Return Value:
        Iterator for sites object.
        """

        for site_name in self.sites:
            yield self.sites[site_name]

    def __len__(self):
        """Length For Object.

        Keyword Arguments:
        self                   -- This object.

        Return Value:
        Length of sites object.
        """

        return len(self.sites)
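

if __name__ == "__main__":
    # Minimal usage sketch, not part of the library API. Assumes network
    # access, since the default constructor fetches the remote data.json.
    sites = SitesInformation()
    print(f"Loaded {len(sites)} sites.")

    # Drop NSFW targets, then show a few site names together with an example
    # profile URL built from each site's username format.
    sites.remove_nsfw_sites()
    for name in sites.site_name_list()[:5]:
        site = sites.sites[name]
        print(site, "->", site.url_username_format.format(site.username_claimed))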