parameters collection for scraping automated & tested \ command line arguments added & tested \ some enhancements to 2 functions \ utils.py added, code refactored & tested \ updated readme

Elyes Manai 2020-03-28 17:24:26 +01:00
parent e64ed73afb
commit 43d626514e
3 changed files with 374 additions and 298 deletions

params.json (new file, 121 lines added)

@@ -0,0 +1,121 @@
{
"Friends": {
"scan_list": [
"All",
"Mutual Friends",
"Following",
"Followers",
"Work",
"College",
"Current City",
"Hometown"
],
"section": [
"/friends",
"/friends_mutual",
"/following",
"/followers",
"/friends_work",
"/friends_college",
"/friends_current_city",
"/friends_hometown"
],
"elements_path": [
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
"//*[contains(@class,'_3i9')][1]/div/div/ul/li[1]/div[2]/div/div/div/div/div[2]/ul/li/div/a",
"//*[contains(@class,'fbProfileBrowserListItem')]/div/a",
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a"
],
"file_names": [
"All Friends.txt",
"Mutual Friends.txt",
"Following.txt",
"Followers.txt",
"Work Friends.txt",
"College Friends.txt",
"Current City Friends.txt",
"Hometown Friends.txt"
],
"save_status":0
},
"Photos": {
"scan_list": [
"'s Photos",
"Photos of"
],
"section": [
"/photos_all",
"/photos_of"
],
"elements_path": [
"//*[contains(@id, 'pic_')]",
"//*[contains(@id, 'pic_')]"
],
"file_names": [
"Uploaded Photos.txt",
"Tagged Photos.txt"
],
"save_status": 1
},
"Videos": {
"scan_list": [
"'s Videos",
"Videos of"
],
"section": [
"/videos_by",
"/videos_of"
],
"elements_path": [
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul",
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul"
],
"file_names": [
"Uploaded Videos.txt",
"Tagged Videos.txt"
],
"save_status": 2
},
"About": {
"scan_list": [],
"section": [
"/about?section=overview",
"/about?section=education",
"/about?section=living",
"/about?section=contact-info",
"/about?section=relationship",
"/about?section=bio",
"/about?section=year-overviews"
],
"elements_path": [
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div"
],
"file_names": [
"Overview.txt",
"Work and Education.txt",
"Places Lived.txt",
"Contact and Basic Info.txt",
"Family and Relationships.txt",
"Details About.txt",
"Life Events.txt"
],
"save_status": 3
},
"Posts": {
"scan_list": [],
"section": [],
"elements_path": ["//div[@class='_5pcb _4b0l _2q8l']"],
"file_names": ["Posts.txt"],
"save_status": 4
}
}
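These five blocks replace the parameter lists that scrap_profile() used to hard-code; the rewritten loop in the diff below reads them straight from this file. A minimal standalone sketch of that lookup (the print line is hypothetical inspection output, not part of the commit):

import json

with open("params.json") as f:
    params = json.load(f)

for item in ["Friends", "Photos", "Videos", "About", "Posts"]:
    entry = params[item]
    # parallel lists: one URL suffix, XPath and output file name per sub-section,
    # plus the save_status flag that tells scrape_data() how to persist results
    print(item, len(entry["section"]), entry["file_names"], entry["save_status"])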


@@ -5,6 +5,8 @@ import platform
import sys
import urllib.request
import yaml
import utils
import argparse
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
@@ -13,43 +15,6 @@ from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# -------------------------------------------------------------
# -------------------------------------------------------------
# Global Variables
driver = None
# whether to download photos or not
download_uploaded_photos = True
download_friends_photos = True
# whether to download the full image or its thumbnail (small size)
# if small size is True then it will be very quick else if its false then it will open each photo to download it
# and it will take much more time
friends_small_size = True
photos_small_size = True
total_scrolls = 2500
current_scrolls = 0
scroll_time = 8
with open("selectors.json") as json_file:
selectors = json.load(json_file)
old_height = 0
firefox_profile_path = selectors.get("firefox_profile_path")
facebook_https_prefix = selectors.get("facebook_https_prefix")
facebook_link_body = selectors.get("facebook_link_body")
CHROMEDRIVER_BINARIES_FOLDER = "bin"
# -------------------------------------------------------------
# -------------------------------------------------------------
def get_facebook_images_url(img_links):
urls = []
@@ -93,7 +58,7 @@ def image_downloader(img_links, folder_name):
parent = os.getcwd()
try:
folder = os.path.join(os.getcwd(), folder_name)
create_folder(folder)
utils.create_folder(folder)
os.chdir(folder)
except Exception:
print("Error in changing directory.")
@@ -126,120 +91,6 @@ def image_downloader(img_links, folder_name):
# -------------------------------------------------------------
def check_height():
new_height = driver.execute_script(selectors.get("height_script"))
return new_height != old_height
# -------------------------------------------------------------
# -------------------------------------------------------------
# helper function: used to scroll the page
def scroll():
global old_height
current_scrolls = 0
while True:
try:
if current_scrolls == total_scrolls:
return
old_height = driver.execute_script(selectors.get("height_script"))
driver.execute_script(selectors.get("scroll_script"))
WebDriverWait(driver, scroll_time, 0.05).until(
lambda driver: check_height()
)
current_scrolls += 1
except TimeoutException:
break
return
# -------------------------------------------------------------
# -------------------------------------------------------------
# --Helper Functions for Posts
def get_status(x):
status = ""
try:
status = x.find_element_by_xpath(
selectors.get("status")
).text # use _1xnd for Pages
except Exception:
try:
status = x.find_element_by_xpath(selectors.get("status_exc")).text
except Exception:
pass
return status
def get_div_links(x, tag):
try:
temp = x.find_element_by_xpath(selectors.get("temp"))
return temp.find_element_by_tag_name(tag)
except Exception:
return ""
def get_title_links(title):
l = title.find_elements_by_tag_name("a")
return l[-1].text, l[-1].get_attribute("href")
def get_title(x):
title = ""
try:
title = x.find_element_by_xpath(selectors.get("title"))
except Exception:
try:
title = x.find_element_by_xpath(selectors.get("title_exc1"))
except Exception:
try:
title = x.find_element_by_xpath(selectors.get("title_exc2"))
except Exception:
pass
finally:
return title
def get_time(x):
time = ""
try:
time = x.find_element_by_tag_name("abbr").get_attribute("title")
time = (
str("%02d" % int(time.split(", ")[1].split()[1]),)
+ "-"
+ str(
(
"%02d"
% (
int(
(
list(calendar.month_abbr).index(
time.split(", ")[1].split()[0][:3]
)
)
),
)
)
)
+ "-"
+ time.split()[3]
+ " "
+ str("%02d" % int(time.split()[5].split(":")[0]))
+ ":"
+ str(time.split()[5].split(":")[1])
)
except Exception:
pass
finally:
return time
def extract_and_write_posts(elements, filename):
try:
f = open(filename, "w", newline="\r\n")
@@ -257,58 +108,64 @@ def extract_and_write_posts(elements, filename):
time = " "
# time
time = get_time(x)
time = utils.get_time(x)
# title
title = get_title(x)
title = utils.get_title(x, selectors)
if title.text.find("shared a memory") != -1:
x = x.find_element_by_xpath(selectors.get("title_element"))
title = get_title(x)
title = utils.get_title(x, selectors)
status = get_status(x)
status = utils.get_status(x, selectors)
if (
title.text
== driver.find_element_by_id(selectors.get("title_text")).text
):
if status == "":
temp = get_div_links(x, "img")
temp = utils.get_div_links(x, "img", selectors)
if (
temp == ""
                        ):  # no image tag, which means it is not a life event
link = get_div_links(x, "a").get_attribute("href")
link = utils.get_div_links(x, "a", selectors).get_attribute(
"href"
)
type = "status update without text"
else:
type = "life event"
link = get_div_links(x, "a").get_attribute("href")
status = get_div_links(x, "a").text
link = utils.get_div_links(x, "a", selectors).get_attribute(
"href"
)
status = utils.get_div_links(x, "a", selectors).text
else:
type = "status update"
if get_div_links(x, "a") != "":
link = get_div_links(x, "a").get_attribute("href")
if utils.get_div_links(x, "a", selectors) != "":
link = utils.get_div_links(x, "a", selectors).get_attribute(
"href"
)
elif title.text.find(" shared ") != -1:
x1, link = get_title_links(title)
x1, link = utils.get_title_links(title)
type = "shared " + x1
elif title.text.find(" at ") != -1 or title.text.find(" in ") != -1:
if title.text.find(" at ") != -1:
x1, link = get_title_links(title)
x1, link = utils.get_title_links(title)
type = "check in"
elif title.text.find(" in ") != 1:
status = get_div_links(x, "a").text
status = utils.get_div_links(x, "a", selectors).text
elif (
title.text.find(" added ") != -1 and title.text.find("photo") != -1
):
type = "added photo"
link = get_div_links(x, "a").get_attribute("href")
link = utils.get_div_links(x, "a", selectors).get_attribute("href")
elif (
title.text.find(" added ") != -1 and title.text.find("video") != -1
):
type = "added video"
link = get_div_links(x, "a").get_attribute("href")
link = utils.get_div_links(x, "a", selectors).get_attribute("href")
else:
type = "others"
@@ -547,6 +404,7 @@ def save_to_file(name, elements, status, current_section):
def scrape_data(user_id, scan_list, section, elements_path, save_status, file_names):
"""Given some parameters, this function can scrap friends/photos/videos/about/posts(statuses) of a profile"""
page = []
@@ -572,7 +430,7 @@ def scrape_data(user_id, scan_list, section, elements_path, save_status, file_na
continue
if save_status != 3:
scroll()
utils.scroll(total_scrolls, driver, selectors, scroll_time)
data = driver.find_elements_by_xpath(elements_path[i])
@@ -619,16 +477,9 @@ def create_original_link(url):
return original_link
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
def create_folder(folder):
if not os.path.exists(folder):
os.mkdir(folder)
def scrap_profile(ids):
folder = os.path.join(os.getcwd(), "data")
create_folder(folder)
utils.create_folder(folder)
os.chdir(folder)
# execute for all profiles given in input.txt file
@@ -642,137 +493,34 @@ def scrap_profile(ids):
try:
target_dir = os.path.join(folder, user_id.split("/")[-1])
create_folder(target_dir)
utils.create_folder(target_dir)
os.chdir(target_dir)
except Exception:
print("Some error occurred in creating the profile directory.")
continue
# ----------------------------------------------------------------------------
print("----------------------------------------")
print("Friends..")
# setting parameters for scrape_data() to scrape friends
scan_list = [
"All",
"Mutual Friends",
"Following",
"Followers",
"Work",
"College",
"Current City",
"Hometown",
]
section = [
"/friends",
"/friends_mutual",
"/following",
"/followers",
"/friends_work",
"/friends_college",
"/friends_current_city",
"/friends_hometown",
]
elements_path = [
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
"//*[contains(@class,'_3i9')][1]/div/div/ul/li[1]/div[2]/div/div/div/div/div[2]/ul/li/div/a",
"//*[contains(@class,'fbProfileBrowserListItem')]/div/a",
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
]
file_names = [
"All Friends.txt",
"Mutual Friends.txt",
"Following.txt",
"Followers.txt",
"Work Friends.txt",
"College Friends.txt",
"Current City Friends.txt",
"Hometown Friends.txt",
]
save_status = 0
to_scrap = ["Friends", "Photos", "Videos", "About", "Posts"]
for item in to_scrap:
print("----------------------------------------")
print("Scraping {}..".format(item))
scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
print("Friends Done!")
if item == "Posts":
scan_list = [None]
elif item == "About":
scan_list = [None] * 7
else:
scan_list = params[item]["scan_list"]
# ----------------------------------------------------------------------------
section = params[item]["section"]
elements_path = params[item]["elements_path"]
file_names = params[item]["file_names"]
save_status = params[item]["save_status"]
print("----------------------------------------")
print("Photos..")
print("Scraping Links..")
# setting parameters for scrape_data() to scrap photos
scan_list = ["'s Photos", "Photos of"]
section = ["/photos_all", "/photos_of"]
elements_path = ["//*[contains(@id, 'pic_')]"] * 2
file_names = ["Uploaded Photos.txt", "Tagged Photos.txt"]
save_status = 1
scrape_data(
user_id, scan_list, section, elements_path, save_status, file_names
)
scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
print("Photos Done!")
# ----------------------------------------------------------------------------
print("----------------------------------------")
print("Videos:")
# setting parameters for scrape_data() to scrap videos
scan_list = ["'s Videos", "Videos of"]
section = ["/videos_by", "/videos_of"]
elements_path = [
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul"
] * 2
file_names = ["Uploaded Videos.txt", "Tagged Videos.txt"]
save_status = 2
scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
print("Videos Done!")
# ----------------------------------------------------------------------------
print("----------------------------------------")
print("About:")
# setting parameters for scrape_data() to scrap the about section
scan_list = [None] * 7
section = [
"/about?section=overview",
"/about?section=education",
"/about?section=living",
"/about?section=contact-info",
"/about?section=relationship",
"/about?section=bio",
"/about?section=year-overviews",
]
elements_path = [
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div"
] * 7
file_names = [
"Overview.txt",
"Work and Education.txt",
"Places Lived.txt",
"Contact and Basic Info.txt",
"Family and Relationships.txt",
"Details About.txt",
"Life Events.txt",
]
save_status = 3
scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
print("About Section Done!")
# ----------------------------------------------------------------------------
print("----------------------------------------")
print("Posts:")
# setting parameters for scrape_data() to scrap posts
scan_list = [None]
section = []
elements_path = ['//div[@class="_5pcb _4b0l _2q8l"]']
file_names = ["Posts.txt"]
save_status = 4
scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
print("Posts(Statuses) Done!")
print("----------------------------------------")
# ----------------------------------------------------------------------------
print("{} Done!".format(item))
print("\nProcess Completed.")
os.chdir("../..")
@@ -901,5 +649,73 @@ def scraper(**kwargs):
# -------------------------------------------------------------
if __name__ == "__main__":
ap = argparse.ArgumentParser()
    # TODO: check whether these help strings can be made clearer / less ambiguous
ap.add_argument(
"-dup",
"--uploaded_photos",
help="download users' uploaded photos?",
default=True,
)
ap.add_argument(
"-dfp", "--friends_photos", help="download users' photos?", default=True
)
ap.add_argument(
"-fss",
"--friends_small_size",
help="Download friends pictures in small size?",
default=True,
)
ap.add_argument(
"-pss",
"--photos_small_size",
help="Download photos in small size?",
default=True,
)
ap.add_argument(
"-ts",
"--total_scrolls",
help="How many times should I scroll down?",
default=2500,
)
ap.add_argument(
"-st", "--scroll_time", help="How much time should I take to scroll?", default=8
)
args = vars(ap.parse_args())
print(args)
# ---------------------------------------------------------
# Global Variables
# ---------------------------------------------------------
# whether to download photos or not
download_uploaded_photos = utils.to_bool(args["uploaded_photos"])
download_friends_photos = utils.to_bool(args["friends_photos"])
# whether to download the full image or its thumbnail (small size)
    # if small size is True, scraping is quick; if False, each photo is opened
    # individually to download the full image, which takes much longer
friends_small_size = utils.to_bool(args["friends_small_size"])
photos_small_size = utils.to_bool(args["photos_small_size"])
total_scrolls = int(args["total_scrolls"])
scroll_time = int(args["scroll_time"])
current_scrolls = 0
old_height = 0
driver = None
CHROMEDRIVER_BINARIES_FOLDER = "bin"
with open("selectors.json") as a, open("params.json") as b:
selectors = json.load(a)
params = json.load(b)
firefox_profile_path = selectors.get("firefox_profile_path")
facebook_https_prefix = selectors.get("facebook_https_prefix")
facebook_link_body = selectors.get("facebook_link_body")
# get things rolling
scraper()
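With these flags, settings that previously required editing globals at the top of the script can be passed at launch. For instance (the entry-point file name scraper.py is assumed here; the rendered diff does not show this file's name):

    python scraper.py -dup False -dfp False -ts 100 -st 10

Boolean flags go through utils.to_bool(), so "True"/"False" and "1"/"0" are all accepted.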

scraper/utils.py (new file, 139 lines added)

@@ -0,0 +1,139 @@
import argparse
import calendar  # needed by get_time() below
import os
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
# -----------------------------------------------------------------------------
#
# -----------------------------------------------------------------------------
def to_bool(x):
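    """Map "True"/"1"/1/True to True and "False"/"0"/0/False to False; raise otherwise."""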
if x in ["False", "0", 0, False]:
return False
elif x in ["True", "1", 1, True]:
return True
else:
raise argparse.ArgumentTypeError("Boolean value expected")
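# note: raising argparse.ArgumentTypeError means this could also be wired in as an
# argparse type converter, e.g. (hypothetical, not what this commit does):
#   ap.add_argument("-dup", "--uploaded_photos", type=utils.to_bool, default=True)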
# -----------------------------------------------------------------------------
#
# -----------------------------------------------------------------------------
def create_folder(folder):
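    """Create the directory if it does not already exist."""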
if not os.path.exists(folder):
os.mkdir(folder)
# -------------------------------------------------------------
# Helper functions for Page scrolling
# -------------------------------------------------------------
# check if height changed
def check_height(driver, selectors, old_height):
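    """Return True once the page height differs from old_height, i.e. new content has loaded."""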
new_height = driver.execute_script(selectors.get("height_script"))
return new_height != old_height
# helper function: used to scroll the page
def scroll(total_scrolls, driver, selectors, scroll_time):
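    """Scroll the page up to total_scrolls times, waiting up to scroll_time seconds
    for new content each round; stops early if the page stops growing.
    Called from scrape_data() as utils.scroll(total_scrolls, driver, selectors, scroll_time).
    """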
current_scrolls = 0
while True:
try:
if current_scrolls == total_scrolls:
return
old_height = driver.execute_script(selectors.get("height_script"))
driver.execute_script(selectors.get("scroll_script"))
WebDriverWait(driver, scroll_time, 0.05).until(
lambda driver: check_height(driver, selectors, old_height)
)
current_scrolls += 1
except TimeoutException:
break
return
# -----------------------------------------------------------------------------
# Helper Functions for Posts
# -----------------------------------------------------------------------------
def get_status(x, selectors):
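    """Return the post's status text via the 'status' selector, falling back to 'status_exc'; "" if neither matches."""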
status = ""
try:
status = x.find_element_by_xpath(
selectors.get("status")
).text # use _1xnd for Pages
except Exception:
try:
status = x.find_element_by_xpath(selectors.get("status_exc")).text
except Exception:
pass
return status
def get_div_links(x, tag, selectors):
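    """Return the first <tag> element under the post's 'temp' selector, or "" if not found."""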
try:
temp = x.find_element_by_xpath(selectors.get("temp"))
return temp.find_element_by_tag_name(tag)
except Exception:
return ""
def get_title_links(title):
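    """Return (text, href) of the last anchor element inside a post title."""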
    links = title.find_elements_by_tag_name("a")
    return links[-1].text, links[-1].get_attribute("href")
def get_title(x, selectors):
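    """Find the post title element, trying the 'title' selector and then the 'title_exc1'/'title_exc2' fallbacks."""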
title = ""
try:
title = x.find_element_by_xpath(selectors.get("title"))
except Exception:
try:
title = x.find_element_by_xpath(selectors.get("title_exc1"))
except Exception:
try:
title = x.find_element_by_xpath(selectors.get("title_exc2"))
except Exception:
pass
finally:
return title
def get_time(x):
    """Return the post timestamp as 'DD-MM-YYYY HH:MM', or "" if unavailable."""
    time = ""
    try:
        # the <abbr> element's title attribute holds a readable date; the parsing
        # below assumes a format like "Saturday, March 28, 2020 at 5:24 PM"
        raw = x.find_element_by_tag_name("abbr").get_attribute("title")
        month_name, day = raw.split(", ")[1].split()
        month = list(calendar.month_abbr).index(month_name[:3])
        year = raw.split()[3]
        hour, minute = raw.split()[5].split(":")
        time = "%02d-%02d-%s %02d:%s" % (int(day), month, year, int(hour), minute)
    except Exception:
        pass
    finally:
        return time