mirror of
https://github.com/harismuneer/Ultimate-Facebook-Scraper
synced 2024-11-10 14:14:23 +00:00
parameters collection for scraping automated & tested \ command line arguments added & tested \ some enhancements to 2 functions \ utils.py added, code refactored & tested \ updated readme
This commit is contained in:
parent
e64ed73afb
commit
43d626514e
3 changed files with 374 additions and 298 deletions
121 params.json Normal file
@@ -0,0 +1,121 @@
{
    "Friends": {
        "scan_list": [
            "All",
            "Mutual Friends",
            "Following",
            "Followers",
            "Work",
            "College",
            "Current City",
            "Hometown"
        ],
        "section": [
            "/friends",
            "/friends_mutual",
            "/following",
            "/followers",
            "/friends_work",
            "/friends_college",
            "/friends_current_city",
            "/friends_hometown"
        ],
        "elements_path": [
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
            "//*[contains(@class,'_3i9')][1]/div/div/ul/li[1]/div[2]/div/div/div/div/div[2]/ul/li/div/a",
            "//*[contains(@class,'fbProfileBrowserListItem')]/div/a",
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a"
        ],
        "file_names": [
            "All Friends.txt",
            "Mutual Friends.txt",
            "Following.txt",
            "Followers.txt",
            "Work Friends.txt",
            "College Friends.txt",
            "Current City Friends.txt",
            "Hometown Friends.txt"
        ],
        "save_status": 0
    },
    "Photos": {
        "scan_list": [
            "'s Photos",
            "Photos of"
        ],
        "section": [
            "/photos_all",
            "/photos_of"
        ],
        "elements_path": [
            "//*[contains(@id, 'pic_')]",
            "//*[contains(@id, 'pic_')]"
        ],
        "file_names": [
            "Uploaded Photos.txt",
            "Tagged Photos.txt"
        ],
        "save_status": 1
    },
    "Videos": {
        "scan_list": [
            "'s Videos",
            "Videos of"
        ],
        "section": [
            "/videos_by",
            "/videos_of"
        ],
        "elements_path": [
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul"
        ],
        "file_names": [
            "Uploaded Videos.txt",
            "Tagged Videos.txt"
        ],
        "save_status": 2
    },
    "About": {
        "scan_list": [],
        "section": [
            "/about?section=overview",
            "/about?section=education",
            "/about?section=living",
            "/about?section=contact-info",
            "/about?section=relationship",
            "/about?section=bio",
            "/about?section=year-overviews"
        ],
        "elements_path": [
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div"
        ],
        "file_names": [
            "Overview.txt",
            "Work and Education.txt",
            "Places Lived.txt",
            "Contact and Basic Info.txt",
            "Family and Relationships.txt",
            "Details About.txt",
            "Life Events.txt"
        ],
        "save_status": 3
    },
    "Posts": {
        "scan_list": [],
        "section": [],
        "elements_path": ["//div[@class='_5pcb _4b0l _2q8l']"],
        "file_names": ["Posts.txt"],
        "save_status": 4
    }
}
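A quick sketch of how this file is meant to be consumed (illustrative only, not part of the commit): every section keeps scan_list, section, elements_path and file_names as parallel arrays, with save_status selecting the save routine, so a caller can load the JSON once and zip the arrays together, which is what the refactored scrap_profile() loop further down does via params[item][...].

import json

# Illustrative sketch: load params.json and walk each section's parallel lists,
# mirroring how scrap_profile() feeds them to scrape_data() in this commit.
with open("params.json") as fh:
    params = json.load(fh)

for item, cfg in params.items():
    for sec, xpath, fname in zip(cfg["section"], cfg["elements_path"], cfg["file_names"]):
        # save_status picks the save routine inside scrape_data()
        print(item, cfg["save_status"], sec, "->", fname)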
@@ -5,6 +5,8 @@ import platform
 import sys
 import urllib.request
 import yaml
+import utils
+import argparse
 
 from selenium import webdriver
 from selenium.common.exceptions import TimeoutException, NoSuchElementException
@@ -13,43 +15,6 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 
-
-# -------------------------------------------------------------
-# -------------------------------------------------------------
-
-
-# Global Variables
-
-driver = None
-
-# whether to download photos or not
-download_uploaded_photos = True
-download_friends_photos = True
-
-# whether to download the full image or its thumbnail (small size)
-# if small size is True then it will be very quick else if its false then it will open each photo to download it
-# and it will take much more time
-friends_small_size = True
-photos_small_size = True
-
-total_scrolls = 2500
-current_scrolls = 0
-scroll_time = 8
-
-with open("selectors.json") as json_file:
-    selectors = json.load(json_file)
-
-old_height = 0
-firefox_profile_path = selectors.get("firefox_profile_path")
-facebook_https_prefix = selectors.get("facebook_https_prefix")
-facebook_link_body = selectors.get("facebook_link_body")
-
-
-CHROMEDRIVER_BINARIES_FOLDER = "bin"
-
-
-# -------------------------------------------------------------
-# -------------------------------------------------------------
 
 def get_facebook_images_url(img_links):
     urls = []
@@ -93,7 +58,7 @@ def image_downloader(img_links, folder_name):
     parent = os.getcwd()
     try:
         folder = os.path.join(os.getcwd(), folder_name)
-        create_folder(folder)
+        utils.create_folder(folder)
         os.chdir(folder)
     except Exception:
         print("Error in changing directory.")
@@ -126,120 +91,6 @@ def image_downloader(img_links, folder_name):
 # -------------------------------------------------------------
 # -------------------------------------------------------------
 
-
-def check_height():
-    new_height = driver.execute_script(selectors.get("height_script"))
-    return new_height != old_height
-
-
-# -------------------------------------------------------------
-# -------------------------------------------------------------
-
-# helper function: used to scroll the page
-def scroll():
-    global old_height
-    current_scrolls = 0
-
-    while True:
-        try:
-            if current_scrolls == total_scrolls:
-                return
-
-            old_height = driver.execute_script(selectors.get("height_script"))
-            driver.execute_script(selectors.get("scroll_script"))
-            WebDriverWait(driver, scroll_time, 0.05).until(
-                lambda driver: check_height()
-            )
-            current_scrolls += 1
-        except TimeoutException:
-            break
-
-    return
-
-
-# -------------------------------------------------------------
-# -------------------------------------------------------------
-
-# --Helper Functions for Posts
-
-
-def get_status(x):
-    status = ""
-    try:
-        status = x.find_element_by_xpath(
-            selectors.get("status")
-        ).text  # use _1xnd for Pages
-    except Exception:
-        try:
-            status = x.find_element_by_xpath(selectors.get("status_exc")).text
-        except Exception:
-            pass
-    return status
-
-
-def get_div_links(x, tag):
-    try:
-        temp = x.find_element_by_xpath(selectors.get("temp"))
-        return temp.find_element_by_tag_name(tag)
-    except Exception:
-        return ""
-
-
-def get_title_links(title):
-    l = title.find_elements_by_tag_name("a")
-    return l[-1].text, l[-1].get_attribute("href")
-
-
-def get_title(x):
-    title = ""
-    try:
-        title = x.find_element_by_xpath(selectors.get("title"))
-    except Exception:
-        try:
-            title = x.find_element_by_xpath(selectors.get("title_exc1"))
-        except Exception:
-            try:
-                title = x.find_element_by_xpath(selectors.get("title_exc2"))
-            except Exception:
-                pass
-    finally:
-        return title
-
-
-def get_time(x):
-    time = ""
-    try:
-        time = x.find_element_by_tag_name("abbr").get_attribute("title")
-        time = (
-            str("%02d" % int(time.split(", ")[1].split()[1]),)
-            + "-"
-            + str(
-                (
-                    "%02d"
-                    % (
-                        int(
-                            (
-                                list(calendar.month_abbr).index(
-                                    time.split(", ")[1].split()[0][:3]
-                                )
-                            )
-                        ),
-                    )
-                )
-            )
-            + "-"
-            + time.split()[3]
-            + " "
-            + str("%02d" % int(time.split()[5].split(":")[0]))
-            + ":"
-            + str(time.split()[5].split(":")[1])
-        )
-    except Exception:
-        pass
-
-    finally:
-        return time
-
-
 def extract_and_write_posts(elements, filename):
     try:
         f = open(filename, "w", newline="\r\n")
@@ -257,58 +108,64 @@ def extract_and_write_posts(elements, filename):
                 time = " "
 
                 # time
-                time = get_time(x)
+                time = utils.get_time(x)
 
                 # title
-                title = get_title(x)
+                title = utils.get_title(x, selectors)
                 if title.text.find("shared a memory") != -1:
                     x = x.find_element_by_xpath(selectors.get("title_element"))
-                    title = get_title(x)
+                    title = utils.get_title(x, selectors)
 
-                status = get_status(x)
+                status = utils.get_status(x, selectors)
                 if (
                     title.text
                     == driver.find_element_by_id(selectors.get("title_text")).text
                 ):
                     if status == "":
-                        temp = get_div_links(x, "img")
+                        temp = utils.get_div_links(x, "img", selectors)
                         if (
                             temp == ""
                         ):  # no image tag which means . it is not a life event
-                            link = get_div_links(x, "a").get_attribute("href")
+                            link = utils.get_div_links(x, "a", selectors).get_attribute(
+                                "href"
+                            )
                             type = "status update without text"
                         else:
                             type = "life event"
-                            link = get_div_links(x, "a").get_attribute("href")
-                            status = get_div_links(x, "a").text
+                            link = utils.get_div_links(x, "a", selectors).get_attribute(
+                                "href"
+                            )
+                            status = utils.get_div_links(x, "a", selectors).text
                     else:
                         type = "status update"
-                        if get_div_links(x, "a") != "":
-                            link = get_div_links(x, "a").get_attribute("href")
+                        if utils.get_div_links(x, "a", selectors) != "":
+                            link = utils.get_div_links(x, "a", selectors).get_attribute(
+                                "href"
+                            )
 
                 elif title.text.find(" shared ") != -1:
 
-                    x1, link = get_title_links(title)
+                    x1, link = utils.get_title_links(title)
                     type = "shared " + x1
 
                 elif title.text.find(" at ") != -1 or title.text.find(" in ") != -1:
                     if title.text.find(" at ") != -1:
-                        x1, link = get_title_links(title)
+                        x1, link = utils.get_title_links(title)
                         type = "check in"
                     elif title.text.find(" in ") != 1:
-                        status = get_div_links(x, "a").text
+                        status = utils.get_div_links(x, "a", selectors).text
 
                 elif (
                     title.text.find(" added ") != -1 and title.text.find("photo") != -1
                 ):
                     type = "added photo"
-                    link = get_div_links(x, "a").get_attribute("href")
+                    link = utils.get_div_links(x, "a", selectors).get_attribute("href")
 
                 elif (
                     title.text.find(" added ") != -1 and title.text.find("video") != -1
                 ):
                     type = "added video"
-                    link = get_div_links(x, "a").get_attribute("href")
+                    link = utils.get_div_links(x, "a", selectors).get_attribute("href")
 
                 else:
                     type = "others"
@@ -547,6 +404,7 @@ def save_to_file(name, elements, status, current_section):
 
 
 def scrape_data(user_id, scan_list, section, elements_path, save_status, file_names):
+
     """Given some parameters, this function can scrap friends/photos/videos/about/posts(statuses) of a profile"""
     page = []
 
@@ -572,7 +430,7 @@ def scrape_data(user_id, scan_list, section, elements_path, save_status, file_names):
             continue
 
         if save_status != 3:
-            scroll()
+            utils.scroll(total_scrolls, driver, selectors, scroll_time)
 
         data = driver.find_elements_by_xpath(elements_path[i])
 
@@ -619,16 +477,9 @@ def create_original_link(url):
     return original_link
 
 
-# -----------------------------------------------------------------------------
-# -----------------------------------------------------------------------------
-def create_folder(folder):
-    if not os.path.exists(folder):
-        os.mkdir(folder)
-
-
 def scrap_profile(ids):
     folder = os.path.join(os.getcwd(), "data")
-    create_folder(folder)
+    utils.create_folder(folder)
     os.chdir(folder)
 
     # execute for all profiles given in input.txt file
@@ -642,137 +493,34 @@ def scrap_profile(ids):
 
         try:
             target_dir = os.path.join(folder, user_id.split("/")[-1])
-            create_folder(target_dir)
+            utils.create_folder(target_dir)
             os.chdir(target_dir)
         except Exception:
             print("Some error occurred in creating the profile directory.")
             continue
 
-        # ----------------------------------------------------------------------------
-        print("----------------------------------------")
-        print("Friends..")
-        # setting parameters for scrape_data() to scrape friends
-        scan_list = [
-            "All",
-            "Mutual Friends",
-            "Following",
-            "Followers",
-            "Work",
-            "College",
-            "Current City",
-            "Hometown",
-        ]
-        section = [
-            "/friends",
-            "/friends_mutual",
-            "/following",
-            "/followers",
-            "/friends_work",
-            "/friends_college",
-            "/friends_current_city",
-            "/friends_hometown",
-        ]
-        elements_path = [
-            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
-            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
-            "//*[contains(@class,'_3i9')][1]/div/div/ul/li[1]/div[2]/div/div/div/div/div[2]/ul/li/div/a",
-            "//*[contains(@class,'fbProfileBrowserListItem')]/div/a",
-            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
-            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
-            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
-            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
-        ]
-        file_names = [
-            "All Friends.txt",
-            "Mutual Friends.txt",
-            "Following.txt",
-            "Followers.txt",
-            "Work Friends.txt",
-            "College Friends.txt",
-            "Current City Friends.txt",
-            "Hometown Friends.txt",
-        ]
-        save_status = 0
-
-        scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
-        print("Friends Done!")
-
-        # ----------------------------------------------------------------------------
-
-        print("----------------------------------------")
-        print("Photos..")
-        print("Scraping Links..")
-        # setting parameters for scrape_data() to scrap photos
-        scan_list = ["'s Photos", "Photos of"]
-        section = ["/photos_all", "/photos_of"]
-        elements_path = ["//*[contains(@id, 'pic_')]"] * 2
-        file_names = ["Uploaded Photos.txt", "Tagged Photos.txt"]
-        save_status = 1
-
-        scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
-        print("Photos Done!")
-
-        # ----------------------------------------------------------------------------
-        print("----------------------------------------")
-        print("Videos:")
-        # setting parameters for scrape_data() to scrap videos
-        scan_list = ["'s Videos", "Videos of"]
-        section = ["/videos_by", "/videos_of"]
-        elements_path = [
-            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul"
-        ] * 2
-        file_names = ["Uploaded Videos.txt", "Tagged Videos.txt"]
-        save_status = 2
-
-        scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
-        print("Videos Done!")
-        # ----------------------------------------------------------------------------
-
-        print("----------------------------------------")
-        print("About:")
-        # setting parameters for scrape_data() to scrap the about section
-        scan_list = [None] * 7
-        section = [
-            "/about?section=overview",
-            "/about?section=education",
-            "/about?section=living",
-            "/about?section=contact-info",
-            "/about?section=relationship",
-            "/about?section=bio",
-            "/about?section=year-overviews",
-        ]
-        elements_path = [
-            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div"
-        ] * 7
-        file_names = [
-            "Overview.txt",
-            "Work and Education.txt",
-            "Places Lived.txt",
-            "Contact and Basic Info.txt",
-            "Family and Relationships.txt",
-            "Details About.txt",
-            "Life Events.txt",
-        ]
-        save_status = 3
-
-        scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
-        print("About Section Done!")
-
-        # ----------------------------------------------------------------------------
-        print("----------------------------------------")
-        print("Posts:")
-        # setting parameters for scrape_data() to scrap posts
-        scan_list = [None]
-        section = []
-        elements_path = ['//div[@class="_5pcb _4b0l _2q8l"]']
-
-        file_names = ["Posts.txt"]
-        save_status = 4
-
-        scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
-        print("Posts(Statuses) Done!")
-        print("----------------------------------------")
-        # ----------------------------------------------------------------------------
+        to_scrap = ["Friends", "Photos", "Videos", "About", "Posts"]
+        for item in to_scrap:
+            print("----------------------------------------")
+            print("Scraping {}..".format(item))
+
+            if item == "Posts":
+                scan_list = [None]
+            elif item == "About":
+                scan_list = [None] * 7
+            else:
+                scan_list = params[item]["scan_list"]
+
+            section = params[item]["section"]
+            elements_path = params[item]["elements_path"]
+            file_names = params[item]["file_names"]
+            save_status = params[item]["save_status"]
+
+            scrape_data(
+                user_id, scan_list, section, elements_path, save_status, file_names
+            )
+
+            print("{} Done!".format(item))
 
         print("\nProcess Completed.")
         os.chdir("../..")
@@ -901,5 +649,73 @@ def scraper(**kwargs):
 # -------------------------------------------------------------
 
 if __name__ == "__main__":
+
+    ap = argparse.ArgumentParser()
+    # PLS CHECK IF HELP CAN BE BETTER / LESS AMBIGUOUS
+    ap.add_argument(
+        "-dup",
+        "--uploaded_photos",
+        help="download users' uploaded photos?",
+        default=True,
+    )
+    ap.add_argument(
+        "-dfp", "--friends_photos", help="download users' photos?", default=True
+    )
+    ap.add_argument(
+        "-fss",
+        "--friends_small_size",
+        help="Download friends pictures in small size?",
+        default=True,
+    )
+    ap.add_argument(
+        "-pss",
+        "--photos_small_size",
+        help="Download photos in small size?",
+        default=True,
+    )
+    ap.add_argument(
+        "-ts",
+        "--total_scrolls",
+        help="How many times should I scroll down?",
+        default=2500,
+    )
+    ap.add_argument(
+        "-st", "--scroll_time", help="How much time should I take to scroll?", default=8
+    )
+
+    args = vars(ap.parse_args())
+    print(args)
+
+    # ---------------------------------------------------------
+    # Global Variables
+    # ---------------------------------------------------------
+
+    # whether to download photos or not
+    download_uploaded_photos = utils.to_bool(args["uploaded_photos"])
+    download_friends_photos = utils.to_bool(args["friends_photos"])
+
+    # whether to download the full image or its thumbnail (small size)
+    # if small size is True then it will be very quick else if its false then it will open each photo to download it
+    # and it will take much more time
+    friends_small_size = utils.to_bool(args["friends_small_size"])
+    photos_small_size = utils.to_bool(args["photos_small_size"])
+
+    total_scrolls = int(args["total_scrolls"])
+    scroll_time = int(args["scroll_time"])
+
+    current_scrolls = 0
+    old_height = 0
+
+    driver = None
+    CHROMEDRIVER_BINARIES_FOLDER = "bin"
+
+    with open("selectors.json") as a, open("params.json") as b:
+        selectors = json.load(a)
+        params = json.load(b)
+
+    firefox_profile_path = selectors.get("firefox_profile_path")
+    facebook_https_prefix = selectors.get("facebook_https_prefix")
+    facebook_link_body = selectors.get("facebook_link_body")
+
     # get things rolling
     scraper()
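One detail worth noting about the block above: argparse hands these new flags back as strings (e.g. "False"), so the values are normalized through utils.to_bool() before use. A minimal standalone sketch of that behaviour (illustrative only, mirroring the to_bool() added in utils.py below):

import argparse

def to_bool(x):  # mirrors utils.to_bool() from this commit
    if x in ["False", "0", 0, False]:
        return False
    elif x in ["True", "1", 1, True]:
        return True
    raise argparse.ArgumentTypeError("Boolean value expected")

ap = argparse.ArgumentParser()
ap.add_argument("-dup", "--uploaded_photos", default=True)
# simulate passing "-dup False" on the command line (entry-script name elided)
args = vars(ap.parse_args(["-dup", "False"]))
print(args["uploaded_photos"], type(args["uploaded_photos"]))  # 'False' <class 'str'>
print(to_bool(args["uploaded_photos"]))                        # False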
139 scraper/utils.py Normal file
@@ -0,0 +1,139 @@
import argparse
import os

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait


# -----------------------------------------------------------------------------
#
# -----------------------------------------------------------------------------
def to_bool(x):
    if x in ["False", "0", 0, False]:
        return False
    elif x in ["True", "1", 1, True]:
        return True
    else:
        raise argparse.ArgumentTypeError("Boolean value expected")


# -----------------------------------------------------------------------------
#
# -----------------------------------------------------------------------------
def create_folder(folder):
    if not os.path.exists(folder):
        os.mkdir(folder)


# -------------------------------------------------------------
# Helper functions for Page scrolling
# -------------------------------------------------------------
# check if height changed
def check_height(driver, selectors, old_height):
    new_height = driver.execute_script(selectors.get("height_script"))
    return new_height != old_height


# helper function: used to scroll the page
def scroll(total_scrolls, driver, selectors, scroll_time):
    global old_height
    current_scrolls = 0

    while True:
        try:
            if current_scrolls == total_scrolls:
                return

            old_height = driver.execute_script(selectors.get("height_script"))
            driver.execute_script(selectors.get("scroll_script"))
            WebDriverWait(driver, scroll_time, 0.05).until(
                lambda driver: check_height(driver, selectors, old_height)
            )
            current_scrolls += 1
        except TimeoutException:
            break

    return


# -----------------------------------------------------------------------------
# Helper Functions for Posts
# -----------------------------------------------------------------------------


def get_status(x, selectors):
    status = ""
    try:
        status = x.find_element_by_xpath(
            selectors.get("status")
        ).text  # use _1xnd for Pages
    except Exception:
        try:
            status = x.find_element_by_xpath(selectors.get("status_exc")).text
        except Exception:
            pass
    return status


def get_div_links(x, tag, selectors):
    try:
        temp = x.find_element_by_xpath(selectors.get("temp"))
        return temp.find_element_by_tag_name(tag)
    except Exception:
        return ""


def get_title_links(title):
    l = title.find_elements_by_tag_name("a")
    return l[-1].text, l[-1].get_attribute("href")


def get_title(x, selectors):
    title = ""
    try:
        title = x.find_element_by_xpath(selectors.get("title"))
    except Exception:
        try:
            title = x.find_element_by_xpath(selectors.get("title_exc1"))
        except Exception:
            try:
                title = x.find_element_by_xpath(selectors.get("title_exc2"))
            except Exception:
                pass
    finally:
        return title


def get_time(x):
    time = ""
    try:
        time = x.find_element_by_tag_name("abbr").get_attribute("title")
        time = (
            str("%02d" % int(time.split(", ")[1].split()[1]),)
            + "-"
            + str(
                (
                    "%02d"
                    % (
                        int(
                            (
                                list(calendar.month_abbr).index(
                                    time.split(", ")[1].split()[0][:3]
                                )
                            )
                        ),
                    )
                )
            )
            + "-"
            + time.split()[3]
            + " "
            + str("%02d" % int(time.split()[5].split(":")[0]))
            + ":"
            + str(time.split()[5].split(":")[1])
        )
    except Exception:
        pass

    finally:
        return time
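For illustration, this is roughly what get_time() computes: it reshapes a post's abbr title attribute into a DD-MM-YYYY HH:MM string. The sample timestamp below is an assumption inferred from the string splits, not a format documented in this commit; note also that get_time() relies on calendar.month_abbr while utils.py above does not import calendar, so the sketch imports it explicitly.

import calendar

# Assumed shape of the abbr "title" attribute that get_time() parses;
# the exact Facebook format is not shown in this commit.
raw = "Friday, August 3, 2018 at 9:46 PM"

# the same splits get_time() applies
day = "%02d" % int(raw.split(", ")[1].split()[1])
month = "%02d" % list(calendar.month_abbr).index(raw.split(", ")[1].split()[0][:3])
year = raw.split()[3]
hour, minute = raw.split()[5].split(":")
print("{}-{}-{} {}:{}".format(day, month, year, "%02d" % int(hour), minute))
# -> 03-08-2018 09:46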