parameters collection for scraping automated & tested \ command line arguments added & tested \ some enhancements to 2 functions \ utils.py added, code refactored & tested \ updated readme

Elyes Manai 2020-03-28 17:24:26 +01:00
parent e64ed73afb
commit 43d626514e
3 changed files with 374 additions and 298 deletions

params.json Normal file

@@ -0,0 +1,121 @@
{
    "Friends": {
        "scan_list": [
            "All",
            "Mutual Friends",
            "Following",
            "Followers",
            "Work",
            "College",
            "Current City",
            "Hometown"
        ],
        "section": [
            "/friends",
            "/friends_mutual",
            "/following",
            "/followers",
            "/friends_work",
            "/friends_college",
            "/friends_current_city",
            "/friends_hometown"
        ],
        "elements_path": [
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
            "//*[contains(@class,'_3i9')][1]/div/div/ul/li[1]/div[2]/div/div/div/div/div[2]/ul/li/div/a",
            "//*[contains(@class,'fbProfileBrowserListItem')]/div/a",
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a"
        ],
        "file_names": [
            "All Friends.txt",
            "Mutual Friends.txt",
            "Following.txt",
            "Followers.txt",
            "Work Friends.txt",
            "College Friends.txt",
            "Current City Friends.txt",
            "Hometown Friends.txt"
        ],
        "save_status": 0
    },
    "Photos": {
        "scan_list": [
            "'s Photos",
            "Photos of"
        ],
        "section": [
            "/photos_all",
            "/photos_of"
        ],
        "elements_path": [
            "//*[contains(@id, 'pic_')]",
            "//*[contains(@id, 'pic_')]"
        ],
        "file_names": [
            "Uploaded Photos.txt",
            "Tagged Photos.txt"
        ],
        "save_status": 1
    },
    "Videos": {
        "scan_list": [
            "'s Videos",
            "Videos of"
        ],
        "section": [
            "/videos_by",
            "/videos_of"
        ],
        "elements_path": [
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul"
        ],
        "file_names": [
            "Uploaded Videos.txt",
            "Tagged Videos.txt"
        ],
        "save_status": 2
    },
    "About": {
        "scan_list": [],
        "section": [
            "/about?section=overview",
            "/about?section=education",
            "/about?section=living",
            "/about?section=contact-info",
            "/about?section=relationship",
            "/about?section=bio",
            "/about?section=year-overviews"
        ],
        "elements_path": [
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div"
        ],
        "file_names": [
            "Overview.txt",
            "Work and Education.txt",
            "Places Lived.txt",
            "Contact and Basic Info.txt",
            "Family and Relationships.txt",
            "Details About.txt",
            "Life Events.txt"
        ],
        "save_status": 3
    },
    "Posts": {
        "scan_list": [],
        "section": [],
        "elements_path": ["//div[@class='_5pcb _4b0l _2q8l']"],
        "file_names": ["Posts.txt"],
        "save_status": 4
    }
}
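
Each top-level key in params.json describes one profile area through four parallel lists (scan_list, section, elements_path, file_names) plus a save_status code; entry i of each list configures one sub-page of that area. A minimal sketch of how a consumer can walk this structure, mirroring the loop added to scrap_profile() further down (the zip-based traversal here is illustrative, not the scraper's exact code):

import json

# Load the parameter collection added in this commit.
with open("params.json") as f:
    params = json.load(f)

for item in ["Friends", "Photos", "Videos", "About", "Posts"]:
    cfg = params[item]
    # The lists are parallel: index i of each one describes a single
    # sub-page (URL suffix, XPath to its elements, output file).
    for section, xpath, file_name in zip(
        cfg["section"], cfg["elements_path"], cfg["file_names"]
    ):
        print("{}: {} -> {} via {}".format(item, section, file_name, xpath))

For "Posts" the section list is empty, so the sketch prints nothing for it; the real loop below special-cases Posts and About when building scan_list.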


@@ -5,6 +5,8 @@ import platform
 import sys
 import urllib.request
 import yaml
+import utils
+import argparse

 from selenium import webdriver
 from selenium.common.exceptions import TimeoutException, NoSuchElementException
@@ -13,43 +15,6 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
-
-
-# -------------------------------------------------------------
-# -------------------------------------------------------------
-# Global Variables
-
-driver = None
-
-# whether to download photos or not
-download_uploaded_photos = True
-download_friends_photos = True
-
-# whether to download the full image or its thumbnail (small size)
-# if small size is True then it will be very quick else if its false then it will open each photo to download it
-# and it will take much more time
-friends_small_size = True
-photos_small_size = True
-
-total_scrolls = 2500
-current_scrolls = 0
-scroll_time = 8
-
-with open("selectors.json") as json_file:
-    selectors = json.load(json_file)
-
-old_height = 0
-
-firefox_profile_path = selectors.get("firefox_profile_path")
-facebook_https_prefix = selectors.get("facebook_https_prefix")
-facebook_link_body = selectors.get("facebook_link_body")
-
-CHROMEDRIVER_BINARIES_FOLDER = "bin"
-
-# -------------------------------------------------------------
-# -------------------------------------------------------------


 def get_facebook_images_url(img_links):
     urls = []
@@ -93,7 +58,7 @@ def image_downloader(img_links, folder_name):
     parent = os.getcwd()
     try:
         folder = os.path.join(os.getcwd(), folder_name)
-        create_folder(folder)
+        utils.create_folder(folder)
         os.chdir(folder)
     except Exception:
         print("Error in changing directory.")
@@ -126,120 +91,6 @@ def image_downloader(img_links, folder_name):
 # -------------------------------------------------------------
-
-
-def check_height():
-    new_height = driver.execute_script(selectors.get("height_script"))
-    return new_height != old_height
-
-
-# -------------------------------------------------------------
-# -------------------------------------------------------------
-# helper function: used to scroll the page
-def scroll():
-    global old_height
-    current_scrolls = 0
-
-    while True:
-        try:
-            if current_scrolls == total_scrolls:
-                return
-
-            old_height = driver.execute_script(selectors.get("height_script"))
-            driver.execute_script(selectors.get("scroll_script"))
-            WebDriverWait(driver, scroll_time, 0.05).until(
-                lambda driver: check_height()
-            )
-            current_scrolls += 1
-        except TimeoutException:
-            break
-
-    return
-
-
-# -------------------------------------------------------------
-# -------------------------------------------------------------
-# --Helper Functions for Posts
-def get_status(x):
-    status = ""
-    try:
-        status = x.find_element_by_xpath(
-            selectors.get("status")
-        ).text  # use _1xnd for Pages
-    except Exception:
-        try:
-            status = x.find_element_by_xpath(selectors.get("status_exc")).text
-        except Exception:
-            pass
-    return status
-
-
-def get_div_links(x, tag):
-    try:
-        temp = x.find_element_by_xpath(selectors.get("temp"))
-        return temp.find_element_by_tag_name(tag)
-    except Exception:
-        return ""
-
-
-def get_title_links(title):
-    l = title.find_elements_by_tag_name("a")
-    return l[-1].text, l[-1].get_attribute("href")
-
-
-def get_title(x):
-    title = ""
-    try:
-        title = x.find_element_by_xpath(selectors.get("title"))
-    except Exception:
-        try:
-            title = x.find_element_by_xpath(selectors.get("title_exc1"))
-        except Exception:
-            try:
-                title = x.find_element_by_xpath(selectors.get("title_exc2"))
-            except Exception:
-                pass
-    finally:
-        return title
-
-
-def get_time(x):
-    time = ""
-    try:
-        time = x.find_element_by_tag_name("abbr").get_attribute("title")
-        time = (
-            str("%02d" % int(time.split(", ")[1].split()[1]),)
-            + "-"
-            + str(
-                (
-                    "%02d"
-                    % (
-                        int(
-                            (
-                                list(calendar.month_abbr).index(
-                                    time.split(", ")[1].split()[0][:3]
-                                )
-                            )
-                        ),
-                    )
-                )
-            )
-            + "-"
-            + time.split()[3]
-            + " "
-            + str("%02d" % int(time.split()[5].split(":")[0]))
-            + ":"
-            + str(time.split()[5].split(":")[1])
-        )
-    except Exception:
-        pass
-    finally:
-        return time
-
-
 def extract_and_write_posts(elements, filename):
     try:
         f = open(filename, "w", newline="\r\n")
@@ -257,58 +108,64 @@ def extract_and_write_posts(elements, filename):
                 time = " "

                 # time
-                time = get_time(x)
+                time = utils.get_time(x)

                 # title
-                title = get_title(x)
+                title = utils.get_title(x, selectors)
                 if title.text.find("shared a memory") != -1:
                     x = x.find_element_by_xpath(selectors.get("title_element"))
-                    title = get_title(x)
+                    title = utils.get_title(x, selectors)

-                status = get_status(x)
+                status = utils.get_status(x, selectors)
                 if (
                     title.text
                     == driver.find_element_by_id(selectors.get("title_text")).text
                 ):
                     if status == "":
-                        temp = get_div_links(x, "img")
+                        temp = utils.get_div_links(x, "img", selectors)
                         if (
                             temp == ""
                         ):  # no image tag which means . it is not a life event
-                            link = get_div_links(x, "a").get_attribute("href")
+                            link = utils.get_div_links(x, "a", selectors).get_attribute(
+                                "href"
+                            )
                             type = "status update without text"
                         else:
                             type = "life event"
-                            link = get_div_links(x, "a").get_attribute("href")
-                            status = get_div_links(x, "a").text
+                            link = utils.get_div_links(x, "a", selectors).get_attribute(
+                                "href"
+                            )
+                            status = utils.get_div_links(x, "a", selectors).text
                     else:
                         type = "status update"
-                        if get_div_links(x, "a") != "":
-                            link = get_div_links(x, "a").get_attribute("href")
+                        if utils.get_div_links(x, "a", selectors) != "":
+                            link = utils.get_div_links(x, "a", selectors).get_attribute(
+                                "href"
+                            )

                 elif title.text.find(" shared ") != -1:
-                    x1, link = get_title_links(title)
+                    x1, link = utils.get_title_links(title)
                     type = "shared " + x1

                 elif title.text.find(" at ") != -1 or title.text.find(" in ") != -1:
                     if title.text.find(" at ") != -1:
-                        x1, link = get_title_links(title)
+                        x1, link = utils.get_title_links(title)
                         type = "check in"
                     elif title.text.find(" in ") != 1:
-                        status = get_div_links(x, "a").text
+                        status = utils.get_div_links(x, "a", selectors).text

                 elif (
                     title.text.find(" added ") != -1 and title.text.find("photo") != -1
                 ):
                     type = "added photo"
-                    link = get_div_links(x, "a").get_attribute("href")
+                    link = utils.get_div_links(x, "a", selectors).get_attribute("href")

                 elif (
                     title.text.find(" added ") != -1 and title.text.find("video") != -1
                 ):
                     type = "added video"
-                    link = get_div_links(x, "a").get_attribute("href")
+                    link = utils.get_div_links(x, "a", selectors).get_attribute("href")

                 else:
                     type = "others"
@@ -547,6 +404,7 @@ def save_to_file(name, elements, status, current_section):
 def scrape_data(user_id, scan_list, section, elements_path, save_status, file_names):
     """Given some parameters, this function can scrap friends/photos/videos/about/posts(statuses) of a profile"""
     page = []
@@ -572,7 +430,7 @@ def scrape_data(user_id, scan_list, section, elements_path, save_status, file_names):
             continue

         if save_status != 3:
-            scroll()
+            utils.scroll(total_scrolls, driver, selectors, scroll_time)

         data = driver.find_elements_by_xpath(elements_path[i])
@@ -619,16 +477,9 @@ def create_original_link(url):
     return original_link


-# -----------------------------------------------------------------------------
-# -----------------------------------------------------------------------------
-def create_folder(folder):
-    if not os.path.exists(folder):
-        os.mkdir(folder)
-
-
 def scrap_profile(ids):
     folder = os.path.join(os.getcwd(), "data")
-    create_folder(folder)
+    utils.create_folder(folder)
     os.chdir(folder)

     # execute for all profiles given in input.txt file
@@ -642,137 +493,34 @@ def scrap_profile(ids):
         try:
             target_dir = os.path.join(folder, user_id.split("/")[-1])
-            create_folder(target_dir)
+            utils.create_folder(target_dir)
             os.chdir(target_dir)
         except Exception:
             print("Some error occurred in creating the profile directory.")
             continue

-        # ----------------------------------------------------------------------------
-        print("----------------------------------------")
-        print("Friends..")
-        # setting parameters for scrape_data() to scrape friends
-        scan_list = [
-            "All",
-            "Mutual Friends",
-            "Following",
-            "Followers",
-            "Work",
-            "College",
-            "Current City",
-            "Hometown",
-        ]
-        section = [
-            "/friends",
-            "/friends_mutual",
-            "/following",
-            "/followers",
-            "/friends_work",
-            "/friends_college",
-            "/friends_current_city",
-            "/friends_hometown",
-        ]
-        elements_path = [
-            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
-            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
-            "//*[contains(@class,'_3i9')][1]/div/div/ul/li[1]/div[2]/div/div/div/div/div[2]/ul/li/div/a",
-            "//*[contains(@class,'fbProfileBrowserListItem')]/div/a",
-            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
-            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
-            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
-            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
-        ]
-        file_names = [
-            "All Friends.txt",
-            "Mutual Friends.txt",
-            "Following.txt",
-            "Followers.txt",
-            "Work Friends.txt",
-            "College Friends.txt",
-            "Current City Friends.txt",
-            "Hometown Friends.txt",
-        ]
-        save_status = 0
-
-        scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
-        print("Friends Done!")
-
-        # ----------------------------------------------------------------------------
-        print("----------------------------------------")
-        print("Photos..")
-        print("Scraping Links..")
-        # setting parameters for scrape_data() to scrap photos
-        scan_list = ["'s Photos", "Photos of"]
-        section = ["/photos_all", "/photos_of"]
-        elements_path = ["//*[contains(@id, 'pic_')]"] * 2
-        file_names = ["Uploaded Photos.txt", "Tagged Photos.txt"]
-        save_status = 1
-
-        scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
-        print("Photos Done!")
-
-        # ----------------------------------------------------------------------------
-        print("----------------------------------------")
-        print("Videos:")
-        # setting parameters for scrape_data() to scrap videos
-        scan_list = ["'s Videos", "Videos of"]
-        section = ["/videos_by", "/videos_of"]
-        elements_path = [
-            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul"
-        ] * 2
-        file_names = ["Uploaded Videos.txt", "Tagged Videos.txt"]
-        save_status = 2
-
-        scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
-        print("Videos Done!")
-
-        # ----------------------------------------------------------------------------
-        print("----------------------------------------")
-        print("About:")
-        # setting parameters for scrape_data() to scrap the about section
-        scan_list = [None] * 7
-        section = [
-            "/about?section=overview",
-            "/about?section=education",
-            "/about?section=living",
-            "/about?section=contact-info",
-            "/about?section=relationship",
-            "/about?section=bio",
-            "/about?section=year-overviews",
-        ]
-        elements_path = [
-            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div"
-        ] * 7
-        file_names = [
-            "Overview.txt",
-            "Work and Education.txt",
-            "Places Lived.txt",
-            "Contact and Basic Info.txt",
-            "Family and Relationships.txt",
-            "Details About.txt",
-            "Life Events.txt",
-        ]
-        save_status = 3
-
-        scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
-        print("About Section Done!")
-
-        # ----------------------------------------------------------------------------
-        print("----------------------------------------")
-        print("Posts:")
-        # setting parameters for scrape_data() to scrap posts
-        scan_list = [None]
-        section = []
-        elements_path = ['//div[@class="_5pcb _4b0l _2q8l"]']
-        file_names = ["Posts.txt"]
-        save_status = 4
-
-        scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
-        print("Posts(Statuses) Done!")
-        print("----------------------------------------")
-        # ----------------------------------------------------------------------------
+        to_scrap = ["Friends", "Photos", "Videos", "About", "Posts"]
+        for item in to_scrap:
+            print("----------------------------------------")
+            print("Scraping {}..".format(item))
+
+            if item == "Posts":
+                scan_list = [None]
+            elif item == "About":
+                scan_list = [None] * 7
+            else:
+                scan_list = params[item]["scan_list"]
+
+            section = params[item]["section"]
+            elements_path = params[item]["elements_path"]
+            file_names = params[item]["file_names"]
+            save_status = params[item]["save_status"]
+
+            scrape_data(
+                user_id, scan_list, section, elements_path, save_status, file_names
+            )
+
+            print("{} Done!".format(item))

     print("\nProcess Completed.")
     os.chdir("../..")
@@ -901,5 +649,73 @@ def scraper(**kwargs):
 # -------------------------------------------------------------

 if __name__ == "__main__":
+    ap = argparse.ArgumentParser()
+    # PLS CHECK IF HELP CAN BE BETTER / LESS AMBIGUOUS
+    ap.add_argument(
+        "-dup",
+        "--uploaded_photos",
+        help="download users' uploaded photos?",
+        default=True,
+    )
+    ap.add_argument(
+        "-dfp", "--friends_photos", help="download users' photos?", default=True
+    )
+    ap.add_argument(
+        "-fss",
+        "--friends_small_size",
+        help="Download friends pictures in small size?",
+        default=True,
+    )
+    ap.add_argument(
+        "-pss",
+        "--photos_small_size",
+        help="Download photos in small size?",
+        default=True,
+    )
+    ap.add_argument(
+        "-ts",
+        "--total_scrolls",
+        help="How many times should I scroll down?",
+        default=2500,
+    )
+    ap.add_argument(
+        "-st", "--scroll_time", help="How much time should I take to scroll?", default=8
+    )
+    args = vars(ap.parse_args())
+    print(args)
+
+    # ---------------------------------------------------------
+    # Global Variables
+    # ---------------------------------------------------------
+
+    # whether to download photos or not
+    download_uploaded_photos = utils.to_bool(args["uploaded_photos"])
+    download_friends_photos = utils.to_bool(args["friends_photos"])
+
+    # whether to download the full image or its thumbnail (small size)
+    # if small size is True then it will be very quick else if its false then it will open each photo to download it
+    # and it will take much more time
+    friends_small_size = utils.to_bool(args["friends_small_size"])
+    photos_small_size = utils.to_bool(args["photos_small_size"])
+
+    total_scrolls = int(args["total_scrolls"])
+    scroll_time = int(args["scroll_time"])
+
+    current_scrolls = 0
+    old_height = 0
+
+    driver = None
+    CHROMEDRIVER_BINARIES_FOLDER = "bin"
+
+    with open("selectors.json") as a, open("params.json") as b:
+        selectors = json.load(a)
+        params = json.load(b)
+
+    firefox_profile_path = selectors.get("firefox_profile_path")
+    facebook_https_prefix = selectors.get("facebook_https_prefix")
+    facebook_link_body = selectors.get("facebook_link_body")
+
     # get things rolling
     scraper()
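
The new flags deliberately take values rather than being store_true switches, so a command line can pass literal strings like "False"; utils.to_bool then normalizes strings, ints, and real booleans. A quick round-trip check of that behavior (parse_args is fed a list here purely for demonstration; on a real run it reads sys.argv):

import argparse

import utils  # the helper module introduced by this commit

ap = argparse.ArgumentParser()
ap.add_argument("-dup", "--uploaded_photos", default=True)
ap.add_argument("-ts", "--total_scrolls", default=2500)

args = vars(ap.parse_args(["-dup", "False", "-ts", "100"]))

# Values given on the command line arrive as strings; defaults keep
# their original Python types. to_bool accepts both forms.
assert utils.to_bool(args["uploaded_photos"]) is False
assert utils.to_bool("1") is True
assert int(args["total_scrolls"]) == 100

An alternative worth noting: argparse's type= hook could apply the conversion at parse time (ap.add_argument(..., type=utils.to_bool)), which is exactly the situation the ArgumentTypeError raised inside to_bool is designed for.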

scraper/utils.py Normal file

@@ -0,0 +1,139 @@
import argparse
import calendar
import os

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait


# -----------------------------------------------------------------------------
#
# -----------------------------------------------------------------------------
def to_bool(x):
    if x in ["False", "0", 0, False]:
        return False
    elif x in ["True", "1", 1, True]:
        return True
    else:
        raise argparse.ArgumentTypeError("Boolean value expected")


# -----------------------------------------------------------------------------
#
# -----------------------------------------------------------------------------
def create_folder(folder):
    if not os.path.exists(folder):
        os.mkdir(folder)


# -------------------------------------------------------------
# Helper functions for Page scrolling
# -------------------------------------------------------------
# check if height changed
def check_height(driver, selectors, old_height):
    new_height = driver.execute_script(selectors.get("height_script"))
    return new_height != old_height


# helper function: used to scroll the page
def scroll(total_scrolls, driver, selectors, scroll_time):
    global old_height
    current_scrolls = 0

    while True:
        try:
            if current_scrolls == total_scrolls:
                return

            old_height = driver.execute_script(selectors.get("height_script"))
            driver.execute_script(selectors.get("scroll_script"))
            WebDriverWait(driver, scroll_time, 0.05).until(
                lambda driver: check_height(driver, selectors, old_height)
            )
            current_scrolls += 1
        except TimeoutException:
            break

    return


# -----------------------------------------------------------------------------
# Helper Functions for Posts
# -----------------------------------------------------------------------------
def get_status(x, selectors):
    status = ""
    try:
        status = x.find_element_by_xpath(
            selectors.get("status")
        ).text  # use _1xnd for Pages
    except Exception:
        try:
            status = x.find_element_by_xpath(selectors.get("status_exc")).text
        except Exception:
            pass
    return status


def get_div_links(x, tag, selectors):
    try:
        temp = x.find_element_by_xpath(selectors.get("temp"))
        return temp.find_element_by_tag_name(tag)
    except Exception:
        return ""


def get_title_links(title):
    l = title.find_elements_by_tag_name("a")
    return l[-1].text, l[-1].get_attribute("href")


def get_title(x, selectors):
    title = ""
    try:
        title = x.find_element_by_xpath(selectors.get("title"))
    except Exception:
        try:
            title = x.find_element_by_xpath(selectors.get("title_exc1"))
        except Exception:
            try:
                title = x.find_element_by_xpath(selectors.get("title_exc2"))
            except Exception:
                pass
    finally:
        return title


def get_time(x):
    time = ""
    try:
        time = x.find_element_by_tag_name("abbr").get_attribute("title")
        time = (
            str("%02d" % int(time.split(", ")[1].split()[1]),)
            + "-"
            + str(
                (
                    "%02d"
                    % (
                        int(
                            (
                                list(calendar.month_abbr).index(
                                    time.split(", ")[1].split()[0][:3]
                                )
                            )
                        ),
                    )
                )
            )
            + "-"
            + time.split()[3]
            + " "
            + str("%02d" % int(time.split()[5].split(":")[0]))
            + ":"
            + str(time.split()[5].split(":")[1])
        )
    except Exception:
        pass
    finally:
        return time
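
get_time() is easier to follow outside Selenium: it only reshapes the title attribute of a post's abbr element. A standalone walk-through of the same splits, assuming the usual shape of that attribute (the sample string below is hypothetical):

import calendar

# Hypothetical abbr title, in the "Weekday, Month D, YYYY at H:MMxm"
# shape that get_time() assumes.
time = "Saturday, March 28, 2020 at 5:24pm"

day = "%02d" % int(time.split(", ")[1].split()[1])  # "28"
month = "%02d" % list(calendar.month_abbr).index(
    time.split(", ")[1].split()[0][:3]  # "Mar" -> 3
)
year = time.split()[3]  # "2020"
hour = "%02d" % int(time.split()[5].split(":")[0])  # "05"
minute = time.split()[5].split(":")[1]  # "24pm"

print("{}-{}-{} {}:{}".format(day, month, year, hour, minute))
# -> 28-03-2020 05:24pm

Note that get_time() swallows any exception and returns whatever it has built so far, so a change in Facebook's date format degrades silently to an empty or partial string rather than crashing the scrape.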