Initial commit

harismuneer 2018-08-23 02:11:20 +05:00
commit a03c786286
7 changed files with 675 additions and 0 deletions

2
.gitattributes vendored Normal file

@@ -0,0 +1,2 @@
# Auto detect text files and perform LF normalization
* text=auto

21
LICENSE Normal file

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2018 harismuneer
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

1
README.md Normal file

@@ -0,0 +1 @@
# Facebook Scraper

BIN
chromedriver Normal file

Binary file not shown.

BIN
chromedriver.exe Normal file

Binary file not shown.

0
input.txt Normal file

651
scraper.py Normal file

@@ -0,0 +1,651 @@
import calendar
import os
import platform
import sys
import urllib.request
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# -------------------------------------------------------------
# -------------------------------------------------------------
# Global Variables
driver = None
# whether to download photos or not
download_uploaded_photos = True
download_friends_photos = True
# whether to download the full image or its thumbnail (small size)
# if small size is True, scraping is quick because only thumbnails are fetched; if it is False,
# each photo is opened individually to download the full image, which takes much longer
friends_small_size = True
photos_small_size = True
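# total_scrolls is only an upper bound; scrolling stops earlier once the page height stops growing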
total_scrolls = 5000
current_scrolls = 0
scroll_time = 5
old_height = 0
# -------------------------------------------------------------
# -------------------------------------------------------------
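# opens each photo link in the browser and pulls the full-resolution image URL from the photo
# viewer's "spotlight" element, retrying while Facebook is still serving a .gif placeholder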
def get_facebook_images_url(img_links):
urls = []
for link in img_links:
if link != "None":
valid_url_found = False
driver.get(link)
try:
while not valid_url_found:
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "spotlight")))
element = driver.find_element_by_class_name("spotlight")
img_url = element.get_attribute('src')
if img_url.find('.gif') == -1:
valid_url_found = True
urls.append(img_url)
            except StaleElementReferenceException:
urls.append(driver.find_element_by_class_name("spotlight").get_attribute('src'))
except:
print("Exception (facebook_image_downloader):", sys.exc_info()[0])
else:
urls.append("None")
return urls
# -------------------------------------------------------------
# -------------------------------------------------------------
# takes a list of image URLs and downloads each image into the given folder
def image_downloader(img_links, folder_name):
img_names = []
try:
parent = os.getcwd()
try:
folder = os.path.join(os.getcwd(), folder_name)
if not os.path.exists(folder):
os.mkdir(folder)
os.chdir(folder)
except:
print("Error in changing directory")
for link in img_links:
img_name = "None"
if link != "None":
img_name = (link.split('.jpg')[0]).split('/')[-1] + '.jpg'
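                # the next check skips one specific file name, which appears to be a Facebook default/placeholder image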
if img_name == "10354686_10150004552801856_220367501106153455_n.jpg":
img_name = "None"
else:
try:
urllib.request.urlretrieve(link, img_name)
except:
img_name = "None"
img_names.append(img_name)
os.chdir(parent)
except:
print("Exception (image_downloader):", sys.exc_info()[0])
return img_names
# -------------------------------------------------------------
# -------------------------------------------------------------
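# returns True once the page height differs from the last recorded height, i.e. the page has
# finished loading new content after a scroll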
def check_height():
new_height = driver.execute_script("return document.body.scrollHeight")
return new_height != old_height
# -------------------------------------------------------------
# -------------------------------------------------------------
# helper function: used to scroll the page
def scroll():
    global old_height, current_scrolls
    current_scrolls = 0
    while True:
try:
if current_scrolls == total_scrolls:
return
old_height = driver.execute_script("return document.body.scrollHeight")
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            WebDriverWait(driver, scroll_time, 0.05).until(lambda _: check_height())
current_scrolls += 1
except TimeoutException:
break
return
# -------------------------------------------------------------
# -------------------------------------------------------------
# --Helper Functions for Posts
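# returns the text body of a post, trying both known userContent class variants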
def get_status(x):
status = ""
try:
status = x.find_element_by_xpath(".//div[@class='_5pbx userContent']").text
except:
try:
status = x.find_element_by_xpath(".//div[@class='userContent']").text
except:
pass
return status
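# returns the first element with the given tag inside a post's attachment container ('_3x-2'), or '' if absent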
def get_div_links(x, tag):
try:
temp = x.find_element_by_xpath(".//div[@class='_3x-2']")
return temp.find_element_by_tag_name(tag)
except:
return ""
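# returns the text and href of the last anchor in a post's title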
def get_title_links(title):
l = title.find_elements_by_tag_name('a')
return l[-1].text, l[-1].get_attribute('href')
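# returns the title span of a post, trying the known title class names in turn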
def get_title(x):
title = ""
try:
title = x.find_element_by_xpath(".//span[@class='fwb fcg']")
except:
try:
title = x.find_element_by_xpath(".//span[@class='fcg']")
except:
try:
title = x.find_element_by_xpath(".//span[@class='fwn fcg']")
except:
pass
    return title
def get_time(x):
    # parses an <abbr> title like "Monday, August 22, 2018 at 2:11 AM" into "22-08-2018 02:11"
    time = ""
    try:
        time = x.find_element_by_tag_name('abbr').get_attribute('title')
        date_part = time.split(", ")[1].split()  # e.g. ["August", "22"]
        day = "%02d" % int(date_part[1])
        month = "%02d" % list(calendar.month_abbr).index(date_part[0][:3])
        year = time.split()[3]
        hour = "%02d" % int(time.split()[5].split(":")[0])
        minute = time.split()[5].split(":")[1]
        time = day + "-" + month + "-" + year + " " + hour + ":" + minute
except:
pass
finally:
return time
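# iterates over the collected post elements and writes one
# "TIME || TYPE || TITLE || STATUS || LINK" row per post to the given file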
def extract_and_write_posts(elements, filename):
    try:
        f = open(filename, "w", newline='\r\n', encoding='utf-8')
        f.writelines(' TIME || TYPE || TITLE || STATUS || LINKS(Shared Posts/Shared Links etc) ' + '\n' + '\n')
        for x in elements:
            try:
                title = " "
                status = " "
                link = ""
                time = " "
                post_type = "others"
                # time
                time = get_time(x)
                # title
                title = get_title(x)
                if title.text.find("shared a memory") != -1:
                    x = x.find_element_by_xpath(".//div[@class='_1dwg _1w_m']")
                    title = get_title(x)
                status = get_status(x)
                if title.text == driver.find_element_by_id("fb-timeline-cover-name").text:
                    if status == '':
                        temp = get_div_links(x, "img")
                        if temp == '':  # no image tag, so this is not a life event
                            link = get_div_links(x, "a").get_attribute('href')
                            post_type = "status update without text"
                        else:
                            post_type = 'life event'
                            link = get_div_links(x, "a").get_attribute('href')
                            status = get_div_links(x, "a").text
                    else:
                        post_type = "status update"
                        if get_div_links(x, "a") != '':
                            link = get_div_links(x, "a").get_attribute('href')
                elif title.text.find(" shared ") != -1:
                    x1, link = get_title_links(title)
                    post_type = "shared " + x1
                elif title.text.find(" at ") != -1 or title.text.find(" in ") != -1:
                    post_type = "check in"
                    if title.text.find(" at ") != -1:
                        x1, link = get_title_links(title)
                    elif title.text.find(" in ") != -1:
                        status = get_div_links(x, "a").text
                elif title.text.find(" added ") != -1 and title.text.find("photo") != -1:
                    post_type = "added photo"
                    link = get_div_links(x, "a").get_attribute('href')
                elif title.text.find(" added ") != -1 and title.text.find("video") != -1:
                    post_type = "added video"
                    link = get_div_links(x, "a").get_attribute('href')
                if not isinstance(title, str):
                    title = title.text
                status = status.replace("\n", " ")
                title = title.replace("\n", " ")
                line = str(time) + " || " + str(post_type) + ' || ' + str(title) + ' || ' + str(status) + ' || ' + str(link) + "\n"
                try:
                    f.writelines(line)
                except:
                    print('Posts: Could not map encoded characters')
            except:
                pass
        f.close()
    except:
        print("Exception (extract_and_write_posts)", "Status =", sys.exc_info()[0])
    return
# -------------------------------------------------------------
# -------------------------------------------------------------
def save_to_file(name, elements, status, current_section):
"""helper function used to save links to files"""
# status 0 = dealing with friends list
# status 1 = dealing with photos
# status 2 = dealing with videos
# status 3 = dealing with about section
# status 4 = dealing with posts
try:
f = None # file pointer
if status != 4:
f = open(name, 'w', encoding='utf-8', newline='\r\n')
results = []
img_names = []
# dealing with Friends
if status == 0:
results = [x.get_attribute('href') for x in elements]
results = [create_original_link(x) for x in results]
try:
if download_friends_photos:
if friends_small_size:
img_links = [x.find_element_by_css_selector('img').get_attribute('src') for x in elements]
else:
links = []
for friend in results:
driver.get(friend)
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CLASS_NAME, "profilePicThumb")))
l = driver.find_element_by_class_name("profilePicThumb").get_attribute('href')
links.append(l)
for i in range(len(links)):
if links[i].find('picture/view') != -1:
links[i] = "None"
img_links = get_facebook_images_url(links)
folder_names = ["Friend's Photos", "Following's Photos", "Follower's Photos", "Work Friends Photos",
"College Friends Photos", "Current City Friends Photos", "Hometown Friends Photos"]
print("Downloading " + folder_names[current_section])
img_names = image_downloader(img_links, folder_names[current_section])
except:
print("Exception (Images)", str(status), "Status =", current_section, sys.exc_info()[0])
# dealing with Photos
elif status == 1:
results = [x.get_attribute('href') for x in elements]
results.pop(0)
try:
if download_uploaded_photos:
if photos_small_size:
background_img_links = driver.find_elements_by_xpath("//*[contains(@id, 'pic_')]/div/i")
background_img_links = [x.get_attribute('style') for x in background_img_links]
background_img_links = [((x.split('(')[1]).split(')')[0]).strip('"') for x in
background_img_links]
else:
background_img_links = get_facebook_images_url(results)
folder_names = ["Uploaded Photos", "Tagged Photos"]
print("Downloading " + folder_names[current_section])
img_names = image_downloader(background_img_links, folder_names[current_section])
except:
print("Exception (Images)", str(status), "Status =", current_section, sys.exc_info()[0])
# dealing with Videos
elif status == 2:
results = elements[0].find_elements_by_css_selector('li')
results = [x.find_element_by_css_selector('a').get_attribute('href') for x in results]
            try:
                # relative links like "/username" need the domain prepended
                if results and results[0][0] == '/':
                    results = [("https://en-gb.facebook.com" + x) for x in results]
            except:
                pass
# dealing with About Section
elif status == 3:
results = elements[0].text
f.writelines(results)
# dealing with Posts
elif status == 4:
extract_and_write_posts(elements, name)
return
if (status == 0) or (status == 1):
for i in range(len(results)):
f.writelines(results[i])
f.write(',')
try:
f.writelines(img_names[i])
except:
f.writelines("None")
f.write('\n')
elif status == 2:
for x in results:
f.writelines(x + "\n")
f.close()
except:
print("Exception (save_to_file)", "Status =", str(status), sys.exc_info()[0])
return
# ----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
def scrap_data(id, scan_list, section, elements_path, save_status, file_names):
"""Given some parameters, this function can scrap friends/photos/videos/about/posts(statuses) of a profile"""
page = []
if save_status == 4:
page.append(id)
for i in range(len(section)):
page.append(id + section[i])
for i in range(len(scan_list)):
try:
driver.get(page[i])
if (save_status == 0) or (save_status == 1) or (
save_status == 2): # Only run this for friends, photos and videos
# the bar which contains all the sections
sections_bar = driver.find_element_by_xpath("//*[@class='_3cz'][1]/div[2]/div[1]")
if sections_bar.text.find(scan_list[i]) == -1:
continue
if save_status != 3:
scroll()
data = driver.find_elements_by_xpath(elements_path[i])
save_to_file(file_names[i], data, save_status, i)
except:
print("Exception (scrap_data)", str(i), "Status =", str(save_status), sys.exc_info()[0])
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
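# normalizes the different profile URL shapes (profile.php?id=..., fnr_t redirects, *_tab
# section links) into a canonical https://en-gb.facebook.com/ link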
def create_original_link(url):
if url.find(".php") != -1:
original_link = "https://en-gb.facebook.com/" + ((url.split("="))[1])
if original_link.find("&") != -1:
original_link = original_link.split("&")[0]
elif url.find("fnr_t") != -1:
original_link = "https://en-gb.facebook.com/" + ((url.split("/"))[-1].split("?")[0])
elif url.find("_tab") != -1:
original_link = "https://en-gb.facebook.com/" + (url.split("?")[0]).split("/")[-1]
else:
original_link = url
return original_link
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
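# top-level driver of the scrape: creates a folder per profile under Data/, then scrapes the
# friends, photos, videos, about section and posts of each profile in turn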
def scrap_profile(ids):
folder = os.path.join(os.getcwd(), "Data")
if not os.path.exists(folder):
os.mkdir(folder)
os.chdir(folder)
# execute for all profiles given in input.txt file
for id in ids:
driver.get(id)
url = driver.current_url
id = create_original_link(url)
print("\nScraping:", id)
try:
if not os.path.exists(os.path.join(folder, id.split('/')[-1])):
os.mkdir(os.path.join(folder, id.split('/')[-1]))
else:
print("A folder with the same profile name already exists."
" Kindly remove that folder first and then run this code.")
continue
os.chdir(os.path.join(folder, id.split('/')[-1]))
except:
print("Some error occurred in creating the profile directory.")
continue
# ----------------------------------------------------------------------------
print("----------------------------------------")
print("Friends..")
        # setting parameters for scrap_data() to scrape friends
scan_list = ["All", "Following", "Followers", "Work", "College", "Current City", "Hometown"]
section = ["/friends", "/following", "/followers", "/friends_work", "/friends_college", "/friends_current_city",
"/friends_hometown"]
elements_path = ["//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
"//*[contains(@class,'_3i9')][1]/div/div/ul/li[1]/div[2]/div/div/div/div/div[2]/ul/li/div/a",
"//*[contains(@class,'fbProfileBrowserListItem')]/div/a",
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a"]
file_names = ["All Friends.txt", "Following.txt", "Followers.txt", "Work Friends.txt", "College Friends.txt",
"Current City Friends.txt", "Hometown Friends.txt"]
save_status = 0
scrap_data(id, scan_list, section, elements_path, save_status, file_names)
print("Friends Done")
# ----------------------------------------------------------------------------
print("----------------------------------------")
print("Photos..")
print("Scraping Links..")
        # setting parameters for scrap_data() to scrape photos
scan_list = ["'s Photos", "Photos of"]
section = ["/photos_all", "/photos_of"]
elements_path = ["//*[contains(@id, 'pic_')]"] * 2
file_names = ["Uploaded Photos.txt", "Tagged Photos.txt"]
save_status = 1
scrap_data(id, scan_list, section, elements_path, save_status, file_names)
print("Photos Done")
# ----------------------------------------------------------------------------
print("----------------------------------------")
print("Videos:")
        # setting parameters for scrap_data() to scrape videos
scan_list = ["'s Videos", "Videos of"]
section = ["/videos_by", "/videos_of"]
elements_path = ["//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul"] * 2
file_names = ["Uploaded Videos.txt", "Tagged Videos.txt"]
save_status = 2
scrap_data(id, scan_list, section, elements_path, save_status, file_names)
print("Videos Done")
# ----------------------------------------------------------------------------
print("----------------------------------------")
print("About:")
        # setting parameters for scrap_data() to scrape the about section
scan_list = [None] * 7
section = ["/about?section=overview", "/about?section=education", "/about?section=living",
"/about?section=contact-info", "/about?section=relationship", "/about?section=bio",
"/about?section=year-overviews"]
elements_path = ["//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div"] * 7
file_names = ["Overview.txt", "Work and Education.txt", "Places Lived.txt", "Contact and Basic Info.txt",
"Family and Relationships.txt", "Details About.txt", "Life Events.txt"]
save_status = 3
scrap_data(id, scan_list, section, elements_path, save_status, file_names)
print("About Section Done")
# ----------------------------------------------------------------------------
print("----------------------------------------")
print("Posts:")
        # setting parameters for scrap_data() to scrape posts
scan_list = [None]
section = []
elements_path = ["//div[@class='_4-u2 mbm _4mrt _5jmm _5pat _5v3q _4-u8']"]
file_names = ["Posts.txt"]
save_status = 4
scrap_data(id, scan_list, section, elements_path, save_status, file_names)
print("Posts(Statuses) Done")
print("----------------------------------------")
# ----------------------------------------------------------------------------
print("\nProcess Completed.")
return
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
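# note: the bundled chromedriver binary must match the locally installed Chrome version,
# otherwise Selenium will fail to start the browser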
def login(email, password):
""" Logging into our own profile """
try:
global driver
options = Options()
# Code to disable notifications pop up of Chrome Browser
options.add_argument("--disable-notifications")
        # options.add_argument("--headless")
# initialization
if platform.system() != "Windows":
driver = webdriver.Chrome(executable_path=os.getcwd() + "/chromedriver", chrome_options=options)
else:
driver = webdriver.Chrome(executable_path=os.getcwd() + "/chromedriver.exe", chrome_options=options)
driver.get("https://en-gb.facebook.com")
driver.maximize_window()
# filling the form
driver.find_element_by_name('email').send_keys(email)
driver.find_element_by_name('pass').send_keys(password)
# clicking on login button
driver.find_element_by_id('loginbutton').click()
except:
print("There's some error in log in.")
exit()
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
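# input.txt is expected to contain one profile link (or username) per line, for example a line
# like "https://www.facebook.com/some.profile" (hypothetical)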
def main():
ids = ["https://en-gb.facebook.com/" + line.split("/")[-1] for line in open("input.txt", newline='\n')]
if len(ids) > 0:
        # Getting email and password from the user to log in to their profile
email = input('\nEnter your Facebook Email: ')
password = input('Enter your Facebook Password: ')
print("\nStarting Scraping...")
login(email, password)
scrap_profile(ids)
        driver.quit()  # quit() also shuts down the chromedriver process (close() only closes the window)
else:
print("Input file is empty..")
# -------------------------------------------------------------
# -------------------------------------------------------------
# -------------------------------------------------------------
# get things rolling
main()