parameters collection for scraping automated & tested \ command line arguments added & tested \ some enhancements to 2 functions \ utils.py added, code refactored & tested \ updated readme

Elyes Manai 2020-03-28 17:24:26 +01:00
parent e64ed73afb
commit 43d626514e
3 changed files with 374 additions and 298 deletions

params.json Normal file

@@ -0,0 +1,121 @@
{
    "Friends": {
        "scan_list": [
            "All",
            "Mutual Friends",
            "Following",
            "Followers",
            "Work",
            "College",
            "Current City",
            "Hometown"
        ],
        "section": [
            "/friends",
            "/friends_mutual",
            "/following",
            "/followers",
            "/friends_work",
            "/friends_college",
            "/friends_current_city",
            "/friends_hometown"
        ],
        "elements_path": [
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
            "//*[contains(@class,'_3i9')][1]/div/div/ul/li[1]/div[2]/div/div/div/div/div[2]/ul/li/div/a",
            "//*[contains(@class,'fbProfileBrowserListItem')]/div/a",
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a"
        ],
        "file_names": [
            "All Friends.txt",
            "Mutual Friends.txt",
            "Following.txt",
            "Followers.txt",
            "Work Friends.txt",
            "College Friends.txt",
            "Current City Friends.txt",
            "Hometown Friends.txt"
        ],
        "save_status": 0
    },
    "Photos": {
        "scan_list": [
            "'s Photos",
            "Photos of"
        ],
        "section": [
            "/photos_all",
            "/photos_of"
        ],
        "elements_path": [
            "//*[contains(@id, 'pic_')]",
            "//*[contains(@id, 'pic_')]"
        ],
        "file_names": [
            "Uploaded Photos.txt",
            "Tagged Photos.txt"
        ],
        "save_status": 1
    },
    "Videos": {
        "scan_list": [
            "'s Videos",
            "Videos of"
        ],
        "section": [
            "/videos_by",
            "/videos_of"
        ],
        "elements_path": [
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul"
        ],
        "file_names": [
            "Uploaded Videos.txt",
            "Tagged Videos.txt"
        ],
        "save_status": 2
    },
    "About": {
        "scan_list": [],
        "section": [
            "/about?section=overview",
            "/about?section=education",
            "/about?section=living",
            "/about?section=contact-info",
            "/about?section=relationship",
            "/about?section=bio",
            "/about?section=year-overviews"
        ],
        "elements_path": [
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div"
        ],
        "file_names": [
            "Overview.txt",
            "Work and Education.txt",
            "Places Lived.txt",
            "Contact and Basic Info.txt",
            "Family and Relationships.txt",
            "Details About.txt",
            "Life Events.txt"
        ],
        "save_status": 3
    },
    "Posts": {
        "scan_list": [],
        "section": [],
        "elements_path": ["//div[@class='_5pcb _4b0l _2q8l']"],
        "file_names": ["Posts.txt"],
        "save_status": 4
    }
}
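
Each top-level key in params.json describes one profile area through four parallel lists (scan_list, section, elements_path, file_names) plus a save_status code; entry i of each list configures one sub-page of that area. A minimal sketch of how a consumer can walk this structure, mirroring the loop added to scrap_profile() further down (the zip-based traversal here is illustrative, not the scraper's exact code):

import json

# Load the parameter collection added in this commit.
with open("params.json") as f:
    params = json.load(f)

for item in ["Friends", "Photos", "Videos", "About", "Posts"]:
    cfg = params[item]
    # The lists are parallel: index i of each one describes a single
    # sub-page (URL suffix, XPath to its elements, output file).
    for section, xpath, file_name in zip(
        cfg["section"], cfg["elements_path"], cfg["file_names"]
    ):
        print("{}: {} -> {} via {}".format(item, section, file_name, xpath))

For "Posts" the section list is empty, so the sketch prints nothing for it; the real loop below special-cases Posts and About when building scan_list.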


@@ -5,6 +5,8 @@ import platform
 import sys
 import urllib.request
 import yaml
+import utils
+import argparse

 from selenium import webdriver
 from selenium.common.exceptions import TimeoutException, NoSuchElementException
@@ -13,43 +15,6 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
-
-
-# -------------------------------------------------------------
-# -------------------------------------------------------------
-# Global Variables
-
-driver = None
-
-# whether to download photos or not
-download_uploaded_photos = True
-download_friends_photos = True
-
-# whether to download the full image or its thumbnail (small size)
-# if small size is True then it will be very quick else if its false then it will open each photo to download it
-# and it will take much more time
-friends_small_size = True
-photos_small_size = True
-
-total_scrolls = 2500
-current_scrolls = 0
-scroll_time = 8
-
-with open("selectors.json") as json_file:
-    selectors = json.load(json_file)
-
-old_height = 0
-
-firefox_profile_path = selectors.get("firefox_profile_path")
-facebook_https_prefix = selectors.get("facebook_https_prefix")
-facebook_link_body = selectors.get("facebook_link_body")
-
-CHROMEDRIVER_BINARIES_FOLDER = "bin"
-
-# -------------------------------------------------------------
-# -------------------------------------------------------------


 def get_facebook_images_url(img_links):
     urls = []
@@ -93,7 +58,7 @@ def image_downloader(img_links, folder_name):
     parent = os.getcwd()
     try:
         folder = os.path.join(os.getcwd(), folder_name)
-        create_folder(folder)
+        utils.create_folder(folder)
         os.chdir(folder)
     except Exception:
         print("Error in changing directory.")
@@ -126,120 +91,6 @@ def image_downloader(img_links, folder_name):
 # -------------------------------------------------------------
-
-
-def check_height():
-    new_height = driver.execute_script(selectors.get("height_script"))
-    return new_height != old_height
-
-
-# -------------------------------------------------------------
-# -------------------------------------------------------------
-# helper function: used to scroll the page
-def scroll():
-    global old_height
-    current_scrolls = 0
-
-    while True:
-        try:
-            if current_scrolls == total_scrolls:
-                return
-
-            old_height = driver.execute_script(selectors.get("height_script"))
-            driver.execute_script(selectors.get("scroll_script"))
-            WebDriverWait(driver, scroll_time, 0.05).until(
-                lambda driver: check_height()
-            )
-            current_scrolls += 1
-        except TimeoutException:
-            break
-
-    return
-
-
-# -------------------------------------------------------------
-# -------------------------------------------------------------
-# --Helper Functions for Posts
-def get_status(x):
-    status = ""
-    try:
-        status = x.find_element_by_xpath(
-            selectors.get("status")
-        ).text  # use _1xnd for Pages
-    except Exception:
-        try:
-            status = x.find_element_by_xpath(selectors.get("status_exc")).text
-        except Exception:
-            pass
-    return status
-
-
-def get_div_links(x, tag):
-    try:
-        temp = x.find_element_by_xpath(selectors.get("temp"))
-        return temp.find_element_by_tag_name(tag)
-    except Exception:
-        return ""
-
-
-def get_title_links(title):
-    l = title.find_elements_by_tag_name("a")
-    return l[-1].text, l[-1].get_attribute("href")
-
-
-def get_title(x):
-    title = ""
-    try:
-        title = x.find_element_by_xpath(selectors.get("title"))
-    except Exception:
-        try:
-            title = x.find_element_by_xpath(selectors.get("title_exc1"))
-        except Exception:
-            try:
-                title = x.find_element_by_xpath(selectors.get("title_exc2"))
-            except Exception:
-                pass
-    finally:
-        return title
-
-
-def get_time(x):
-    time = ""
-    try:
-        time = x.find_element_by_tag_name("abbr").get_attribute("title")
-        time = (
-            str("%02d" % int(time.split(", ")[1].split()[1]),)
-            + "-"
-            + str(
-                (
-                    "%02d"
-                    % (
-                        int(
-                            (
-                                list(calendar.month_abbr).index(
-                                    time.split(", ")[1].split()[0][:3]
-                                )
-                            )
-                        ),
-                    )
-                )
-            )
-            + "-"
-            + time.split()[3]
-            + " "
-            + str("%02d" % int(time.split()[5].split(":")[0]))
-            + ":"
-            + str(time.split()[5].split(":")[1])
-        )
-    except Exception:
-        pass
-    finally:
-        return time
-
-
 def extract_and_write_posts(elements, filename):
     try:
         f = open(filename, "w", newline="\r\n")
@@ -257,58 +108,64 @@ def extract_and_write_posts(elements, filename):
                 time = " "

                 # time
-                time = get_time(x)
+                time = utils.get_time(x)

                 # title
-                title = get_title(x)
+                title = utils.get_title(x, selectors)
                 if title.text.find("shared a memory") != -1:
                     x = x.find_element_by_xpath(selectors.get("title_element"))
-                    title = get_title(x)
+                    title = utils.get_title(x, selectors)

-                status = get_status(x)
+                status = utils.get_status(x, selectors)
                 if (
                     title.text
                     == driver.find_element_by_id(selectors.get("title_text")).text
                 ):
                     if status == "":
-                        temp = get_div_links(x, "img")
+                        temp = utils.get_div_links(x, "img", selectors)
                         if (
                             temp == ""
                         ):  # no image tag which means . it is not a life event
-                            link = get_div_links(x, "a").get_attribute("href")
+                            link = utils.get_div_links(x, "a", selectors).get_attribute(
+                                "href"
+                            )
                             type = "status update without text"
                         else:
                             type = "life event"
-                            link = get_div_links(x, "a").get_attribute("href")
-                            status = get_div_links(x, "a").text
+                            link = utils.get_div_links(x, "a", selectors).get_attribute(
+                                "href"
+                            )
+                            status = utils.get_div_links(x, "a", selectors).text
                     else:
                         type = "status update"
-                        if get_div_links(x, "a") != "":
-                            link = get_div_links(x, "a").get_attribute("href")
+                        if utils.get_div_links(x, "a", selectors) != "":
+                            link = utils.get_div_links(x, "a", selectors).get_attribute(
+                                "href"
+                            )

                 elif title.text.find(" shared ") != -1:
-                    x1, link = get_title_links(title)
+                    x1, link = utils.get_title_links(title)
                     type = "shared " + x1

                 elif title.text.find(" at ") != -1 or title.text.find(" in ") != -1:
                     if title.text.find(" at ") != -1:
-                        x1, link = get_title_links(title)
+                        x1, link = utils.get_title_links(title)
                         type = "check in"
                     elif title.text.find(" in ") != 1:
-                        status = get_div_links(x, "a").text
+                        status = utils.get_div_links(x, "a", selectors).text

                 elif (
                     title.text.find(" added ") != -1 and title.text.find("photo") != -1
                 ):
                     type = "added photo"
-                    link = get_div_links(x, "a").get_attribute("href")
+                    link = utils.get_div_links(x, "a", selectors).get_attribute("href")

                 elif (
                     title.text.find(" added ") != -1 and title.text.find("video") != -1
                 ):
                     type = "added video"
-                    link = get_div_links(x, "a").get_attribute("href")
+                    link = utils.get_div_links(x, "a", selectors).get_attribute("href")

                 else:
                     type = "others"
@@ -547,6 +404,7 @@ def save_to_file(name, elements, status, current_section):
 def scrape_data(user_id, scan_list, section, elements_path, save_status, file_names):
     """Given some parameters, this function can scrap friends/photos/videos/about/posts(statuses) of a profile"""
     page = []
@@ -572,7 +430,7 @@ def scrape_data(user_id, scan_list, section, elements_path, save_status, file_names):
             continue

         if save_status != 3:
-            scroll()
+            utils.scroll(total_scrolls, driver, selectors, scroll_time)

         data = driver.find_elements_by_xpath(elements_path[i])
@@ -619,16 +477,9 @@ def create_original_link(url):
     return original_link


-# -----------------------------------------------------------------------------
-# -----------------------------------------------------------------------------
-def create_folder(folder):
-    if not os.path.exists(folder):
-        os.mkdir(folder)
-
-
 def scrap_profile(ids):
     folder = os.path.join(os.getcwd(), "data")
-    create_folder(folder)
+    utils.create_folder(folder)
     os.chdir(folder)

     # execute for all profiles given in input.txt file
@@ -642,137 +493,34 @@ def scrap_profile(ids):
         try:
             target_dir = os.path.join(folder, user_id.split("/")[-1])
-            create_folder(target_dir)
+            utils.create_folder(target_dir)
             os.chdir(target_dir)
         except Exception:
             print("Some error occurred in creating the profile directory.")
             continue

-        # ----------------------------------------------------------------------------
-        print("----------------------------------------")
-        print("Friends..")
-        # setting parameters for scrape_data() to scrape friends
-        scan_list = [
-            "All",
-            "Mutual Friends",
-            "Following",
-            "Followers",
-            "Work",
-            "College",
-            "Current City",
-            "Hometown",
-        ]
-        section = [
-            "/friends",
-            "/friends_mutual",
-            "/following",
-            "/followers",
-            "/friends_work",
-            "/friends_college",
-            "/friends_current_city",
-            "/friends_hometown",
-        ]
-        elements_path = [
-            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
-            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
-            "//*[contains(@class,'_3i9')][1]/div/div/ul/li[1]/div[2]/div/div/div/div/div[2]/ul/li/div/a",
-            "//*[contains(@class,'fbProfileBrowserListItem')]/div/a",
-            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
-            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
-            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
-            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
-        ]
-        file_names = [
-            "All Friends.txt",
-            "Mutual Friends.txt",
-            "Following.txt",
-            "Followers.txt",
-            "Work Friends.txt",
-            "College Friends.txt",
-            "Current City Friends.txt",
-            "Hometown Friends.txt",
-        ]
-        save_status = 0
-
-        scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
-        print("Friends Done!")
-
-        # ----------------------------------------------------------------------------
-        print("----------------------------------------")
-        print("Photos..")
-        print("Scraping Links..")
-        # setting parameters for scrape_data() to scrap photos
-        scan_list = ["'s Photos", "Photos of"]
-        section = ["/photos_all", "/photos_of"]
-        elements_path = ["//*[contains(@id, 'pic_')]"] * 2
-        file_names = ["Uploaded Photos.txt", "Tagged Photos.txt"]
-        save_status = 1
-
-        scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
-        print("Photos Done!")
-
-        # ----------------------------------------------------------------------------
-        print("----------------------------------------")
-        print("Videos:")
-        # setting parameters for scrape_data() to scrap videos
-        scan_list = ["'s Videos", "Videos of"]
-        section = ["/videos_by", "/videos_of"]
-        elements_path = [
-            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul"
-        ] * 2
-        file_names = ["Uploaded Videos.txt", "Tagged Videos.txt"]
-        save_status = 2
-
-        scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
-        print("Videos Done!")
-
-        # ----------------------------------------------------------------------------
-        print("----------------------------------------")
-        print("About:")
-        # setting parameters for scrape_data() to scrap the about section
-        scan_list = [None] * 7
-        section = [
-            "/about?section=overview",
-            "/about?section=education",
-            "/about?section=living",
-            "/about?section=contact-info",
-            "/about?section=relationship",
-            "/about?section=bio",
-            "/about?section=year-overviews",
-        ]
-        elements_path = [
-            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div"
-        ] * 7
-        file_names = [
-            "Overview.txt",
-            "Work and Education.txt",
-            "Places Lived.txt",
-            "Contact and Basic Info.txt",
-            "Family and Relationships.txt",
-            "Details About.txt",
-            "Life Events.txt",
-        ]
-        save_status = 3
-
-        scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
-        print("About Section Done!")
-
-        # ----------------------------------------------------------------------------
-        print("----------------------------------------")
-        print("Posts:")
-        # setting parameters for scrape_data() to scrap posts
-        scan_list = [None]
-        section = []
-        elements_path = ['//div[@class="_5pcb _4b0l _2q8l"]']
-        file_names = ["Posts.txt"]
-        save_status = 4
-
-        scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
-        print("Posts(Statuses) Done!")
-        print("----------------------------------------")
-        # ----------------------------------------------------------------------------
+        to_scrap = ["Friends", "Photos", "Videos", "About", "Posts"]
+        for item in to_scrap:
+            print("----------------------------------------")
+            print("Scraping {}..".format(item))
+
+            if item == "Posts":
+                scan_list = [None]
+            elif item == "About":
+                scan_list = [None] * 7
+            else:
+                scan_list = params[item]["scan_list"]
+
+            section = params[item]["section"]
+            elements_path = params[item]["elements_path"]
+            file_names = params[item]["file_names"]
+            save_status = params[item]["save_status"]
+
+            scrape_data(
+                user_id, scan_list, section, elements_path, save_status, file_names
+            )
+
+            print("{} Done!".format(item))

     print("\nProcess Completed.")
     os.chdir("../..")
@@ -901,5 +649,73 @@ def scraper(**kwargs):
 # -------------------------------------------------------------

 if __name__ == "__main__":
+    ap = argparse.ArgumentParser()
+    # PLS CHECK IF HELP CAN BE BETTER / LESS AMBIGUOUS
+    ap.add_argument(
+        "-dup",
+        "--uploaded_photos",
+        help="download users' uploaded photos?",
+        default=True,
+    )
+    ap.add_argument(
+        "-dfp", "--friends_photos", help="download users' photos?", default=True
+    )
+    ap.add_argument(
+        "-fss",
+        "--friends_small_size",
+        help="Download friends pictures in small size?",
+        default=True,
+    )
+    ap.add_argument(
+        "-pss",
+        "--photos_small_size",
+        help="Download photos in small size?",
+        default=True,
+    )
+    ap.add_argument(
+        "-ts",
+        "--total_scrolls",
+        help="How many times should I scroll down?",
+        default=2500,
+    )
+    ap.add_argument(
+        "-st", "--scroll_time", help="How much time should I take to scroll?", default=8
+    )
+    args = vars(ap.parse_args())
+    print(args)
+
+    # ---------------------------------------------------------
+    # Global Variables
+    # ---------------------------------------------------------
+
+    # whether to download photos or not
+    download_uploaded_photos = utils.to_bool(args["uploaded_photos"])
+    download_friends_photos = utils.to_bool(args["friends_photos"])
+
+    # whether to download the full image or its thumbnail (small size)
+    # if small size is True then it will be very quick else if its false then it will open each photo to download it
+    # and it will take much more time
+    friends_small_size = utils.to_bool(args["friends_small_size"])
+    photos_small_size = utils.to_bool(args["photos_small_size"])
+
+    total_scrolls = int(args["total_scrolls"])
+    scroll_time = int(args["scroll_time"])
+
+    current_scrolls = 0
+    old_height = 0
+
+    driver = None
+    CHROMEDRIVER_BINARIES_FOLDER = "bin"
+
+    with open("selectors.json") as a, open("params.json") as b:
+        selectors = json.load(a)
+        params = json.load(b)
+
+    firefox_profile_path = selectors.get("firefox_profile_path")
+    facebook_https_prefix = selectors.get("facebook_https_prefix")
+    facebook_link_body = selectors.get("facebook_link_body")
+
     # get things rolling
     scraper()
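
The new flags deliberately take values rather than being store_true switches, so a command line can pass literal strings like "False"; utils.to_bool then normalizes strings, ints, and real booleans. A quick round-trip check of that behavior (parse_args is fed a list here purely for demonstration; on a real run it reads sys.argv):

import argparse

import utils  # the helper module introduced by this commit

ap = argparse.ArgumentParser()
ap.add_argument("-dup", "--uploaded_photos", default=True)
ap.add_argument("-ts", "--total_scrolls", default=2500)

args = vars(ap.parse_args(["-dup", "False", "-ts", "100"]))

# Values given on the command line arrive as strings; defaults keep
# their original Python types. to_bool accepts both forms.
assert utils.to_bool(args["uploaded_photos"]) is False
assert utils.to_bool("1") is True
assert int(args["total_scrolls"]) == 100

An alternative worth noting: argparse's type= hook could apply the conversion at parse time (ap.add_argument(..., type=utils.to_bool)), which is exactly the situation the ArgumentTypeError raised inside to_bool is designed for.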

scraper/utils.py Normal file

@@ -0,0 +1,139 @@
import argparse
import calendar
import os

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait


# -----------------------------------------------------------------------------
#
# -----------------------------------------------------------------------------
def to_bool(x):
    if x in ["False", "0", 0, False]:
        return False
    elif x in ["True", "1", 1, True]:
        return True
    else:
        raise argparse.ArgumentTypeError("Boolean value expected")


# -----------------------------------------------------------------------------
#
# -----------------------------------------------------------------------------
def create_folder(folder):
    if not os.path.exists(folder):
        os.mkdir(folder)


# -------------------------------------------------------------
# Helper functions for Page scrolling
# -------------------------------------------------------------
# check if height changed
def check_height(driver, selectors, old_height):
    new_height = driver.execute_script(selectors.get("height_script"))
    return new_height != old_height


# helper function: used to scroll the page
def scroll(total_scrolls, driver, selectors, scroll_time):
    global old_height
    current_scrolls = 0

    while True:
        try:
            if current_scrolls == total_scrolls:
                return

            old_height = driver.execute_script(selectors.get("height_script"))
            driver.execute_script(selectors.get("scroll_script"))
            WebDriverWait(driver, scroll_time, 0.05).until(
                lambda driver: check_height(driver, selectors, old_height)
            )
            current_scrolls += 1
        except TimeoutException:
            break

    return


# -----------------------------------------------------------------------------
# Helper Functions for Posts
# -----------------------------------------------------------------------------
def get_status(x, selectors):
    status = ""
    try:
        status = x.find_element_by_xpath(
            selectors.get("status")
        ).text  # use _1xnd for Pages
    except Exception:
        try:
            status = x.find_element_by_xpath(selectors.get("status_exc")).text
        except Exception:
            pass
    return status


def get_div_links(x, tag, selectors):
    try:
        temp = x.find_element_by_xpath(selectors.get("temp"))
        return temp.find_element_by_tag_name(tag)
    except Exception:
        return ""


def get_title_links(title):
    l = title.find_elements_by_tag_name("a")
    return l[-1].text, l[-1].get_attribute("href")


def get_title(x, selectors):
    title = ""
    try:
        title = x.find_element_by_xpath(selectors.get("title"))
    except Exception:
        try:
            title = x.find_element_by_xpath(selectors.get("title_exc1"))
        except Exception:
            try:
                title = x.find_element_by_xpath(selectors.get("title_exc2"))
            except Exception:
                pass
    finally:
        return title


def get_time(x):
    time = ""
    try:
        time = x.find_element_by_tag_name("abbr").get_attribute("title")
        time = (
            str("%02d" % int(time.split(", ")[1].split()[1]),)
            + "-"
            + str(
                (
                    "%02d"
                    % (
                        int(
                            (
                                list(calendar.month_abbr).index(
                                    time.split(", ")[1].split()[0][:3]
                                )
                            )
                        ),
                    )
                )
            )
            + "-"
            + time.split()[3]
            + " "
            + str("%02d" % int(time.split()[5].split(":")[0]))
            + ":"
            + str(time.split()[5].split(":")[1])
        )
    except Exception:
        pass
    finally:
        return time
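
get_time() is easier to follow outside Selenium: it only reshapes the title attribute of a post's abbr element. A standalone walk-through of the same splits, assuming the usual shape of that attribute (the sample string below is hypothetical):

import calendar

# Hypothetical abbr title, in the "Weekday, Month D, YYYY at H:MMxm"
# shape that get_time() assumes.
time = "Saturday, March 28, 2020 at 5:24pm"

day = "%02d" % int(time.split(", ")[1].split()[1])  # "28"
month = "%02d" % list(calendar.month_abbr).index(
    time.split(", ")[1].split()[0][:3]  # "Mar" -> 3
)
year = time.split()[3]  # "2020"
hour = "%02d" % int(time.split()[5].split(":")[0])  # "05"
minute = time.split()[5].split(":")[1]  # "24pm"

print("{}-{}-{} {}:{}".format(day, month, year, hour, minute))
# -> 28-03-2020 05:24pm

Note that get_time() swallows any exception and returns whatever it has built so far, so a change in Facebook's date format degrades silently to an empty or partial string rather than crashing the scrape.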