mirror of
https://github.com/harismuneer/Ultimate-Facebook-Scraper
synced 2024-11-10 14:14:23 +00:00
parameters collection for scraping automated & tested \ command line arguments added & tested \ some enhancements to 2 functions \ utils.py added, code refactored & tested \ updated readme
This commit is contained in:
parent
e64ed73afb
commit
43d626514e
3 changed files with 374 additions and 298 deletions
121
params.json
Normal file
121
params.json
Normal file
|
@ -0,0 +1,121 @@
|
|||
{
|
||||
"Friends": {
|
||||
"scan_list": [
|
||||
"All",
|
||||
"Mutual Friends",
|
||||
"Following",
|
||||
"Followers",
|
||||
"Work",
|
||||
"College",
|
||||
"Current City",
|
||||
"Hometown"
|
||||
],
|
||||
"section": [
|
||||
"/friends",
|
||||
"/friends_mutual",
|
||||
"/following",
|
||||
"/followers",
|
||||
"/friends_work",
|
||||
"/friends_college",
|
||||
"/friends_current_city",
|
||||
"/friends_hometown"
|
||||
],
|
||||
"elements_path": [
|
||||
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
|
||||
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
|
||||
"//*[contains(@class,'_3i9')][1]/div/div/ul/li[1]/div[2]/div/div/div/div/div[2]/ul/li/div/a",
|
||||
"//*[contains(@class,'fbProfileBrowserListItem')]/div/a",
|
||||
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
|
||||
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
|
||||
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
|
||||
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a"
|
||||
],
|
||||
"file_names": [
|
||||
"All Friends.txt",
|
||||
"Mutual Friends.txt",
|
||||
"Following.txt",
|
||||
"Followers.txt",
|
||||
"Work Friends.txt",
|
||||
"College Friends.txt",
|
||||
"Current City Friends.txt",
|
||||
"Hometown Friends.txt"
|
||||
],
|
||||
"save_status":0
|
||||
},
|
||||
"Photos": {
|
||||
"scan_list": [
|
||||
"'s Photos",
|
||||
"Photos of"
|
||||
],
|
||||
"section": [
|
||||
"/photos_all",
|
||||
"/photos_of"
|
||||
],
|
||||
"elements_path": [
|
||||
"//*[contains(@id, 'pic_')]",
|
||||
"//*[contains(@id, 'pic_')]"
|
||||
],
|
||||
"file_names": [
|
||||
"Uploaded Photos.txt",
|
||||
"Tagged Photos.txt"
|
||||
],
|
||||
"save_status": 1
|
||||
},
|
||||
"Videos": {
|
||||
"scan_list": [
|
||||
"'s Videos",
|
||||
"Videos of"
|
||||
],
|
||||
"section": [
|
||||
"/videos_by",
|
||||
"/videos_of"
|
||||
],
|
||||
"elements_path": [
|
||||
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul",
|
||||
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul"
|
||||
],
|
||||
"file_names": [
|
||||
"Uploaded Videos.txt",
|
||||
"Tagged Videos.txt"
|
||||
],
|
||||
"save_status": 2
|
||||
},
|
||||
"About": {
|
||||
"scan_list": [],
|
||||
"section": [
|
||||
"/about?section=overview",
|
||||
"/about?section=education",
|
||||
"/about?section=living",
|
||||
"/about?section=contact-info",
|
||||
"/about?section=relationship",
|
||||
"/about?section=bio",
|
||||
"/about?section=year-overviews"
|
||||
],
|
||||
"elements_path": [
|
||||
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
|
||||
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
|
||||
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
|
||||
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
|
||||
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
|
||||
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
|
||||
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div"
|
||||
],
|
||||
"file_names": [
|
||||
"Overview.txt",
|
||||
"Work and Education.txt",
|
||||
"Places Lived.txt",
|
||||
"Contact and Basic Info.txt",
|
||||
"Family and Relationships.txt",
|
||||
"Details About.txt",
|
||||
"Life Events.txt"
|
||||
],
|
||||
"save_status": 3
|
||||
},
|
||||
"Posts": {
|
||||
"scan_list": [],
|
||||
"section": [],
|
||||
"elements_path": ["//div[@class='_5pcb _4b0l _2q8l']"],
|
||||
"file_names": ["Posts.txt"],
|
||||
"save_status": 4
|
||||
}
|
||||
}
|
|
@ -5,6 +5,8 @@ import platform
|
|||
import sys
|
||||
import urllib.request
|
||||
import yaml
|
||||
import utils
|
||||
import argparse
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.common.exceptions import TimeoutException, NoSuchElementException
|
||||
|
@ -13,43 +15,6 @@ from selenium.webdriver.common.by import By
|
|||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
|
||||
# -------------------------------------------------------------
|
||||
# -------------------------------------------------------------
|
||||
|
||||
|
||||
# Global Variables
|
||||
|
||||
driver = None
|
||||
|
||||
# whether to download photos or not
|
||||
download_uploaded_photos = True
|
||||
download_friends_photos = True
|
||||
|
||||
# whether to download the full image or its thumbnail (small size)
|
||||
# if small size is True then it will be very quick else if its false then it will open each photo to download it
|
||||
# and it will take much more time
|
||||
friends_small_size = True
|
||||
photos_small_size = True
|
||||
|
||||
total_scrolls = 2500
|
||||
current_scrolls = 0
|
||||
scroll_time = 8
|
||||
|
||||
with open("selectors.json") as json_file:
|
||||
selectors = json.load(json_file)
|
||||
|
||||
old_height = 0
|
||||
firefox_profile_path = selectors.get("firefox_profile_path")
|
||||
facebook_https_prefix = selectors.get("facebook_https_prefix")
|
||||
facebook_link_body = selectors.get("facebook_link_body")
|
||||
|
||||
|
||||
CHROMEDRIVER_BINARIES_FOLDER = "bin"
|
||||
|
||||
|
||||
# -------------------------------------------------------------
|
||||
# -------------------------------------------------------------
|
||||
|
||||
|
||||
def get_facebook_images_url(img_links):
|
||||
urls = []
|
||||
|
@ -93,7 +58,7 @@ def image_downloader(img_links, folder_name):
|
|||
parent = os.getcwd()
|
||||
try:
|
||||
folder = os.path.join(os.getcwd(), folder_name)
|
||||
create_folder(folder)
|
||||
utils.create_folder(folder)
|
||||
os.chdir(folder)
|
||||
except Exception:
|
||||
print("Error in changing directory.")
|
||||
|
@ -126,120 +91,6 @@ def image_downloader(img_links, folder_name):
|
|||
# -------------------------------------------------------------
|
||||
|
||||
|
||||
def check_height():
|
||||
new_height = driver.execute_script(selectors.get("height_script"))
|
||||
return new_height != old_height
|
||||
|
||||
|
||||
# -------------------------------------------------------------
|
||||
# -------------------------------------------------------------
|
||||
|
||||
# helper function: used to scroll the page
|
||||
def scroll():
|
||||
global old_height
|
||||
current_scrolls = 0
|
||||
|
||||
while True:
|
||||
try:
|
||||
if current_scrolls == total_scrolls:
|
||||
return
|
||||
|
||||
old_height = driver.execute_script(selectors.get("height_script"))
|
||||
driver.execute_script(selectors.get("scroll_script"))
|
||||
WebDriverWait(driver, scroll_time, 0.05).until(
|
||||
lambda driver: check_height()
|
||||
)
|
||||
current_scrolls += 1
|
||||
except TimeoutException:
|
||||
break
|
||||
|
||||
return
|
||||
|
||||
|
||||
# -------------------------------------------------------------
|
||||
# -------------------------------------------------------------
|
||||
|
||||
# --Helper Functions for Posts
|
||||
|
||||
|
||||
def get_status(x):
|
||||
status = ""
|
||||
try:
|
||||
status = x.find_element_by_xpath(
|
||||
selectors.get("status")
|
||||
).text # use _1xnd for Pages
|
||||
except Exception:
|
||||
try:
|
||||
status = x.find_element_by_xpath(selectors.get("status_exc")).text
|
||||
except Exception:
|
||||
pass
|
||||
return status
|
||||
|
||||
|
||||
def get_div_links(x, tag):
|
||||
try:
|
||||
temp = x.find_element_by_xpath(selectors.get("temp"))
|
||||
return temp.find_element_by_tag_name(tag)
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
|
||||
def get_title_links(title):
|
||||
l = title.find_elements_by_tag_name("a")
|
||||
return l[-1].text, l[-1].get_attribute("href")
|
||||
|
||||
|
||||
def get_title(x):
|
||||
title = ""
|
||||
try:
|
||||
title = x.find_element_by_xpath(selectors.get("title"))
|
||||
except Exception:
|
||||
try:
|
||||
title = x.find_element_by_xpath(selectors.get("title_exc1"))
|
||||
except Exception:
|
||||
try:
|
||||
title = x.find_element_by_xpath(selectors.get("title_exc2"))
|
||||
except Exception:
|
||||
pass
|
||||
finally:
|
||||
return title
|
||||
|
||||
|
||||
def get_time(x):
|
||||
time = ""
|
||||
try:
|
||||
time = x.find_element_by_tag_name("abbr").get_attribute("title")
|
||||
time = (
|
||||
str("%02d" % int(time.split(", ")[1].split()[1]),)
|
||||
+ "-"
|
||||
+ str(
|
||||
(
|
||||
"%02d"
|
||||
% (
|
||||
int(
|
||||
(
|
||||
list(calendar.month_abbr).index(
|
||||
time.split(", ")[1].split()[0][:3]
|
||||
)
|
||||
)
|
||||
),
|
||||
)
|
||||
)
|
||||
)
|
||||
+ "-"
|
||||
+ time.split()[3]
|
||||
+ " "
|
||||
+ str("%02d" % int(time.split()[5].split(":")[0]))
|
||||
+ ":"
|
||||
+ str(time.split()[5].split(":")[1])
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
finally:
|
||||
return time
|
||||
|
||||
|
||||
def extract_and_write_posts(elements, filename):
|
||||
try:
|
||||
f = open(filename, "w", newline="\r\n")
|
||||
|
@ -257,58 +108,64 @@ def extract_and_write_posts(elements, filename):
|
|||
time = " "
|
||||
|
||||
# time
|
||||
time = get_time(x)
|
||||
time = utils.get_time(x)
|
||||
|
||||
# title
|
||||
title = get_title(x)
|
||||
title = utils.get_title(x, selectors)
|
||||
if title.text.find("shared a memory") != -1:
|
||||
x = x.find_element_by_xpath(selectors.get("title_element"))
|
||||
title = get_title(x)
|
||||
title = utils.get_title(x, selectors)
|
||||
|
||||
status = get_status(x)
|
||||
status = utils.get_status(x, selectors)
|
||||
if (
|
||||
title.text
|
||||
== driver.find_element_by_id(selectors.get("title_text")).text
|
||||
):
|
||||
if status == "":
|
||||
temp = get_div_links(x, "img")
|
||||
temp = utils.get_div_links(x, "img", selectors)
|
||||
if (
|
||||
temp == ""
|
||||
): # no image tag which means . it is not a life event
|
||||
link = get_div_links(x, "a").get_attribute("href")
|
||||
link = utils.get_div_links(x, "a", selectors).get_attribute(
|
||||
"href"
|
||||
)
|
||||
type = "status update without text"
|
||||
else:
|
||||
type = "life event"
|
||||
link = get_div_links(x, "a").get_attribute("href")
|
||||
status = get_div_links(x, "a").text
|
||||
link = utils.get_div_links(x, "a", selectors).get_attribute(
|
||||
"href"
|
||||
)
|
||||
status = utils.get_div_links(x, "a", selectors).text
|
||||
else:
|
||||
type = "status update"
|
||||
if get_div_links(x, "a") != "":
|
||||
link = get_div_links(x, "a").get_attribute("href")
|
||||
if utils.get_div_links(x, "a", selectors) != "":
|
||||
link = utils.get_div_links(x, "a", selectors).get_attribute(
|
||||
"href"
|
||||
)
|
||||
|
||||
elif title.text.find(" shared ") != -1:
|
||||
|
||||
x1, link = get_title_links(title)
|
||||
x1, link = utils.get_title_links(title)
|
||||
type = "shared " + x1
|
||||
|
||||
elif title.text.find(" at ") != -1 or title.text.find(" in ") != -1:
|
||||
if title.text.find(" at ") != -1:
|
||||
x1, link = get_title_links(title)
|
||||
x1, link = utils.get_title_links(title)
|
||||
type = "check in"
|
||||
elif title.text.find(" in ") != 1:
|
||||
status = get_div_links(x, "a").text
|
||||
status = utils.get_div_links(x, "a", selectors).text
|
||||
|
||||
elif (
|
||||
title.text.find(" added ") != -1 and title.text.find("photo") != -1
|
||||
):
|
||||
type = "added photo"
|
||||
link = get_div_links(x, "a").get_attribute("href")
|
||||
link = utils.get_div_links(x, "a", selectors).get_attribute("href")
|
||||
|
||||
elif (
|
||||
title.text.find(" added ") != -1 and title.text.find("video") != -1
|
||||
):
|
||||
type = "added video"
|
||||
link = get_div_links(x, "a").get_attribute("href")
|
||||
link = utils.get_div_links(x, "a", selectors).get_attribute("href")
|
||||
|
||||
else:
|
||||
type = "others"
|
||||
|
@ -547,6 +404,7 @@ def save_to_file(name, elements, status, current_section):
|
|||
|
||||
|
||||
def scrape_data(user_id, scan_list, section, elements_path, save_status, file_names):
|
||||
|
||||
"""Given some parameters, this function can scrap friends/photos/videos/about/posts(statuses) of a profile"""
|
||||
page = []
|
||||
|
||||
|
@ -572,7 +430,7 @@ def scrape_data(user_id, scan_list, section, elements_path, save_status, file_na
|
|||
continue
|
||||
|
||||
if save_status != 3:
|
||||
scroll()
|
||||
utils.scroll(total_scrolls, driver, selectors, scroll_time)
|
||||
|
||||
data = driver.find_elements_by_xpath(elements_path[i])
|
||||
|
||||
|
@ -619,16 +477,9 @@ def create_original_link(url):
|
|||
return original_link
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# -----------------------------------------------------------------------------
|
||||
def create_folder(folder):
|
||||
if not os.path.exists(folder):
|
||||
os.mkdir(folder)
|
||||
|
||||
|
||||
def scrap_profile(ids):
|
||||
folder = os.path.join(os.getcwd(), "data")
|
||||
create_folder(folder)
|
||||
utils.create_folder(folder)
|
||||
os.chdir(folder)
|
||||
|
||||
# execute for all profiles given in input.txt file
|
||||
|
@ -642,137 +493,34 @@ def scrap_profile(ids):
|
|||
|
||||
try:
|
||||
target_dir = os.path.join(folder, user_id.split("/")[-1])
|
||||
create_folder(target_dir)
|
||||
utils.create_folder(target_dir)
|
||||
os.chdir(target_dir)
|
||||
except Exception:
|
||||
print("Some error occurred in creating the profile directory.")
|
||||
continue
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
print("----------------------------------------")
|
||||
print("Friends..")
|
||||
# setting parameters for scrape_data() to scrape friends
|
||||
scan_list = [
|
||||
"All",
|
||||
"Mutual Friends",
|
||||
"Following",
|
||||
"Followers",
|
||||
"Work",
|
||||
"College",
|
||||
"Current City",
|
||||
"Hometown",
|
||||
]
|
||||
section = [
|
||||
"/friends",
|
||||
"/friends_mutual",
|
||||
"/following",
|
||||
"/followers",
|
||||
"/friends_work",
|
||||
"/friends_college",
|
||||
"/friends_current_city",
|
||||
"/friends_hometown",
|
||||
]
|
||||
elements_path = [
|
||||
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
|
||||
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
|
||||
"//*[contains(@class,'_3i9')][1]/div/div/ul/li[1]/div[2]/div/div/div/div/div[2]/ul/li/div/a",
|
||||
"//*[contains(@class,'fbProfileBrowserListItem')]/div/a",
|
||||
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
|
||||
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
|
||||
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
|
||||
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
|
||||
]
|
||||
file_names = [
|
||||
"All Friends.txt",
|
||||
"Mutual Friends.txt",
|
||||
"Following.txt",
|
||||
"Followers.txt",
|
||||
"Work Friends.txt",
|
||||
"College Friends.txt",
|
||||
"Current City Friends.txt",
|
||||
"Hometown Friends.txt",
|
||||
]
|
||||
save_status = 0
|
||||
to_scrap = ["Friends", "Photos", "Videos", "About", "Posts"]
|
||||
for item in to_scrap:
|
||||
print("----------------------------------------")
|
||||
print("Scraping {}..".format(item))
|
||||
|
||||
scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
|
||||
print("Friends Done!")
|
||||
if item == "Posts":
|
||||
scan_list = [None]
|
||||
elif item == "About":
|
||||
scan_list = [None] * 7
|
||||
else:
|
||||
scan_list = params[item]["scan_list"]
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
section = params[item]["section"]
|
||||
elements_path = params[item]["elements_path"]
|
||||
file_names = params[item]["file_names"]
|
||||
save_status = params[item]["save_status"]
|
||||
|
||||
print("----------------------------------------")
|
||||
print("Photos..")
|
||||
print("Scraping Links..")
|
||||
# setting parameters for scrape_data() to scrap photos
|
||||
scan_list = ["'s Photos", "Photos of"]
|
||||
section = ["/photos_all", "/photos_of"]
|
||||
elements_path = ["//*[contains(@id, 'pic_')]"] * 2
|
||||
file_names = ["Uploaded Photos.txt", "Tagged Photos.txt"]
|
||||
save_status = 1
|
||||
scrape_data(
|
||||
user_id, scan_list, section, elements_path, save_status, file_names
|
||||
)
|
||||
|
||||
scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
|
||||
print("Photos Done!")
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
print("----------------------------------------")
|
||||
print("Videos:")
|
||||
# setting parameters for scrape_data() to scrap videos
|
||||
scan_list = ["'s Videos", "Videos of"]
|
||||
section = ["/videos_by", "/videos_of"]
|
||||
elements_path = [
|
||||
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul"
|
||||
] * 2
|
||||
file_names = ["Uploaded Videos.txt", "Tagged Videos.txt"]
|
||||
save_status = 2
|
||||
|
||||
scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
|
||||
print("Videos Done!")
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
print("----------------------------------------")
|
||||
print("About:")
|
||||
# setting parameters for scrape_data() to scrap the about section
|
||||
scan_list = [None] * 7
|
||||
section = [
|
||||
"/about?section=overview",
|
||||
"/about?section=education",
|
||||
"/about?section=living",
|
||||
"/about?section=contact-info",
|
||||
"/about?section=relationship",
|
||||
"/about?section=bio",
|
||||
"/about?section=year-overviews",
|
||||
]
|
||||
elements_path = [
|
||||
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div"
|
||||
] * 7
|
||||
file_names = [
|
||||
"Overview.txt",
|
||||
"Work and Education.txt",
|
||||
"Places Lived.txt",
|
||||
"Contact and Basic Info.txt",
|
||||
"Family and Relationships.txt",
|
||||
"Details About.txt",
|
||||
"Life Events.txt",
|
||||
]
|
||||
save_status = 3
|
||||
|
||||
scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
|
||||
print("About Section Done!")
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
print("----------------------------------------")
|
||||
print("Posts:")
|
||||
# setting parameters for scrape_data() to scrap posts
|
||||
scan_list = [None]
|
||||
section = []
|
||||
elements_path = ['//div[@class="_5pcb _4b0l _2q8l"]']
|
||||
|
||||
file_names = ["Posts.txt"]
|
||||
save_status = 4
|
||||
|
||||
scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
|
||||
print("Posts(Statuses) Done!")
|
||||
print("----------------------------------------")
|
||||
# ----------------------------------------------------------------------------
|
||||
print("{} Done!".format(item))
|
||||
|
||||
print("\nProcess Completed.")
|
||||
os.chdir("../..")
|
||||
|
@ -901,5 +649,73 @@ def scraper(**kwargs):
|
|||
# -------------------------------------------------------------
|
||||
|
||||
if __name__ == "__main__":

    # ---------------------------------------------------------
    # Command-line arguments
    # ---------------------------------------------------------
    ap = argparse.ArgumentParser()
    # NOTE(review): boolean flags parse as plain strings here and are
    # converted with utils.to_bool() below; passing type=utils.to_bool
    # would reject bad values at parse time -- confirm before changing.
    ap.add_argument(
        "-dup",
        "--uploaded_photos",
        help="download users' uploaded photos?",
        default=True,
    )
    ap.add_argument(
        "-dfp", "--friends_photos", help="download users' photos?", default=True
    )
    ap.add_argument(
        "-fss",
        "--friends_small_size",
        help="Download friends pictures in small size?",
        default=True,
    )
    ap.add_argument(
        "-pss",
        "--photos_small_size",
        help="Download photos in small size?",
        default=True,
    )
    ap.add_argument(
        "-ts",
        "--total_scrolls",
        help="How many times should I scroll down?",
        default=2500,
    )
    ap.add_argument(
        "-st", "--scroll_time", help="How much time should I take to scroll?", default=8
    )

    args = vars(ap.parse_args())
    print(args)  # echo the effective configuration for the user

    # ---------------------------------------------------------
    # Global Variables
    # ---------------------------------------------------------

    # whether to download photos or not
    download_uploaded_photos = utils.to_bool(args["uploaded_photos"])
    download_friends_photos = utils.to_bool(args["friends_photos"])

    # whether to download the full image or its thumbnail (small size)
    # if small size is True then it will be very quick else if its false then it will open each photo to download it
    # and it will take much more time
    friends_small_size = utils.to_bool(args["friends_small_size"])
    photos_small_size = utils.to_bool(args["photos_small_size"])

    total_scrolls = int(args["total_scrolls"])
    scroll_time = int(args["scroll_time"])

    # scrolling state shared with the scraping helpers
    current_scrolls = 0
    old_height = 0

    driver = None  # webdriver instance; created later once scraping starts
    CHROMEDRIVER_BINARIES_FOLDER = "bin"

    # selectors.json holds the XPath/CSS selectors; params.json holds the
    # per-section scrape parameters (scan lists, sections, element paths,
    # output file names, save_status codes)
    with open("selectors.json") as a, open("params.json") as b:
        selectors = json.load(a)
        params = json.load(b)

    firefox_profile_path = selectors.get("firefox_profile_path")
    facebook_https_prefix = selectors.get("facebook_https_prefix")
    facebook_link_body = selectors.get("facebook_link_body")

    # get things rolling
    scraper()
|
||||
|
|
139
scraper/utils.py
Normal file
139
scraper/utils.py
Normal file
|
@ -0,0 +1,139 @@
|
|||
import argparse
import calendar
import os

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
#
|
||||
# -----------------------------------------------------------------------------
|
||||
def to_bool(x):
    """Convert a command-line flag value to a bool.

    Accepts real bools, the ints 0/1, and (case-insensitively) the
    strings "true"/"false", "1"/"0", "yes"/"no", "y"/"n".  The original
    exact-match values ("True", "False", "0", "1", 0, 1, True, False)
    all keep their previous meaning.

    Raises:
        argparse.ArgumentTypeError: for any other value, so argparse
            reports a clean usage error.
    """
    if isinstance(x, bool):
        return x
    if isinstance(x, int):
        if x in (0, 1):
            return bool(x)
    elif isinstance(x, str):
        lowered = x.strip().lower()
        if lowered in ("false", "0", "no", "n"):
            return False
        if lowered in ("true", "1", "yes", "y"):
            return True
    raise argparse.ArgumentTypeError("Boolean value expected")
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
#
|
||||
# -----------------------------------------------------------------------------
|
||||
def create_folder(folder):
    """Create *folder* if it does not already exist.

    Uses os.makedirs(..., exist_ok=True), which also creates missing
    parent directories and avoids the exists()-then-mkdir() race the
    previous implementation had (the folder could appear or vanish
    between the check and the create).
    """
    os.makedirs(folder, exist_ok=True)
|
||||
|
||||
|
||||
# -------------------------------------------------------------
|
||||
# Helper functions for Page scrolling
|
||||
# -------------------------------------------------------------
|
||||
# check if height changed
|
||||
def check_height(driver, selectors, old_height):
    """Tell whether the document height differs from *old_height*.

    Used as the WebDriverWait predicate while scrolling: a changed
    height means new content finished loading.
    """
    current = driver.execute_script(selectors.get("height_script"))
    return current != old_height
|
||||
|
||||
|
||||
# helper function: used to scroll the page
|
||||
def scroll(total_scrolls, driver, selectors, scroll_time):
    """Scroll the page down up to *total_scrolls* times.

    After each scroll it waits up to *scroll_time* seconds (polling
    every 50 ms) for the document height to change; a TimeoutException
    means the page stopped growing, so scrolling ends early.
    """
    global old_height  # shared with the check_height predicate
    scrolls_done = 0

    while scrolls_done != total_scrolls:
        try:
            old_height = driver.execute_script(selectors.get("height_script"))
            driver.execute_script(selectors.get("scroll_script"))
            WebDriverWait(driver, scroll_time, 0.05).until(
                lambda d: check_height(d, selectors, old_height)
            )
            scrolls_done += 1
        except TimeoutException:
            break
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Helper Functions for Posts
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def get_status(x, selectors):
    """Return the text of a post's status body, or "" when none is found.

    Tries the primary status selector first and falls back to the
    alternate one; every lookup failure is swallowed.
    """
    try:
        # use _1xnd for Pages
        return x.find_element_by_xpath(selectors.get("status")).text
    except Exception:
        pass
    try:
        return x.find_element_by_xpath(selectors.get("status_exc")).text
    except Exception:
        return ""
|
||||
|
||||
|
||||
def get_div_links(x, tag, selectors):
    """Return the first *tag* element inside the post's link wrapper.

    Returns "" when either the wrapper or the tag is missing, so
    callers can test the result against the empty string.
    """
    try:
        return x.find_element_by_xpath(
            selectors.get("temp")
        ).find_element_by_tag_name(tag)
    except Exception:
        return ""
|
||||
|
||||
|
||||
def get_title_links(title):
    """Return (text, href) of the last anchor inside the title element."""
    last_anchor = title.find_elements_by_tag_name("a")[-1]
    return last_anchor.text, last_anchor.get_attribute("href")
|
||||
|
||||
|
||||
def get_title(x, selectors):
    """Return the post's title element, or "" when no selector matches.

    Tries the primary "title" selector, then the two fallbacks, in
    order, and returns the first element found.

    The original nested try/try/try with ``return`` inside ``finally``
    is gone: a return in ``finally`` silently suppresses any in-flight
    exception, including KeyboardInterrupt (flake8 B012).
    """
    for key in ("title", "title_exc1", "title_exc2"):
        try:
            return x.find_element_by_xpath(selectors.get(key))
        except Exception:
            continue  # selector did not match; try the next fallback
    return ""
|
||||
|
||||
|
||||
def get_time(x):
    """Return the post's timestamp formatted as "DD-MM-YYYY HH:MM".

    Reads the ``title`` attribute of the post's <abbr> element, which
    looks like "Saturday, January 5, 2019 at 10:30 AM".  Returns ""
    when the <abbr> element is missing, and the raw attribute text when
    it does not match the expected layout (same graceful degradation as
    before).

    Fixes: the module never imported ``calendar``, so the old nested
    expression always raised NameError and fell through its bare
    ``except`` — every timestamp came back unparsed.  Also removes the
    ``return`` inside ``finally`` (suppresses in-flight exceptions,
    flake8 B012).
    """
    try:
        raw = x.find_element_by_tag_name("abbr").get_attribute("title")
    except Exception:
        return ""
    try:
        # 'Saturday, January 5, 2019 at 10:30 AM'.split(", ")[1] -> 'January 5'
        month_name, day_of_month = raw.split(", ")[1].split()[:2]
        day = "%02d" % int(day_of_month)
        # month_abbr is 1-based ('' at index 0), so index() is the month number
        month = "%02d" % list(calendar.month_abbr).index(month_name[:3])
        tokens = raw.split()
        year = tokens[3]
        clock = tokens[5].split(":")
        hour = "%02d" % int(clock[0])
        minute = clock[1]
        return "%s-%s-%s %s:%s" % (day, month, year, hour, minute)
    except Exception:
        # Unrecognized layout: fall back to the raw attribute text,
        # matching the old behavior of returning the unparsed string.
        return raw
|
Loading…
Reference in a new issue