import os

from functools import wraps
from collections import defaultdict
from datetime import datetime

from index import (
    parse_json_link_index,
    write_link_index,
    patch_index_title_hack,
)
from config import (
    CURL_BINARY,
    GIT_BINARY,
    WGET_BINARY,
    YOUTUBEDL_BINARY,
    CHROME_BINARY,
    FETCH_FAVICON,
    FETCH_TITLE,
    FETCH_WGET,
    FETCH_WGET_REQUISITES,
    FETCH_PDF,
    FETCH_SCREENSHOT,
    FETCH_DOM,
    FETCH_WARC,
    FETCH_GIT,
    FETCH_MEDIA,
    RESOLUTION,
    CHECK_SSL_VALIDITY,
    SUBMIT_ARCHIVE_DOT_ORG,
    COOKIES_FILE,
    WGET_USER_AGENT,
    CHROME_USER_DATA_DIR,
    CHROME_HEADLESS,
    CHROME_SANDBOX,
    TIMEOUT,
    MEDIA_TIMEOUT,
    ANSI,
    ARCHIVE_DIR,
    GIT_DOMAINS,
    GIT_SHA,
)
from util import (
    domain,
    without_query,
    without_fragment,
    fetch_page_title,
    progress,
    chmod_file,
    pretty_path,
    print_error_hints,
    check_link_structure,
    wget_output_path,
    run, PIPE, DEVNULL,
)

_RESULTS_TOTALS = {   # globals are bad, mmkay
    'skipped': 0,
    'succeeded': 0,
    'failed': 0,
}

def load_link_index(link_dir, link):
    """check for an existing link archive in the given directory,
    and load + merge it into the given link dict
    """
    is_new = not os.path.exists(link_dir)
    if is_new:
        os.makedirs(link_dir)
    else:
        link = {
            **parse_json_link_index(link_dir),
            **link,
        }

    check_link_structure(link)
    print_link_status_line(link_dir, link, is_new)

    return link

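# Illustrative example of the merge above (hypothetical values): the saved index is
# unpacked first and the incoming link second, so fields passed in by the caller win
# over whatever is already on disk while everything else is preserved:
#
#   saved    = {'url': 'https://example.com', 'title': 'Old Title', 'history': {...}}
#   incoming = {'url': 'https://example.com', 'title': 'New Title'}
#   {**saved, **incoming}   # -> title is 'New Title', history is kept
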
def archive_link(link_dir, link, overwrite=True):
    """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""

    ARCHIVE_METHODS = (
        (FETCH_TITLE, fetch_title),
        (FETCH_FAVICON, fetch_favicon),
        (FETCH_WGET, fetch_wget),
        (FETCH_PDF, fetch_pdf),
        (FETCH_SCREENSHOT, fetch_screenshot),
        (FETCH_DOM, fetch_dom),
        (FETCH_GIT, fetch_git),
        (FETCH_MEDIA, fetch_media),
        (SUBMIT_ARCHIVE_DOT_ORG, archive_dot_org),
    )
    active_methods = [method for toggle, method in ARCHIVE_METHODS if toggle]

    try:
        link = load_link_index(link_dir, link)

        for archive_method in active_methods:
            archive_method(link_dir, link, overwrite=overwrite)

        write_link_index(link_dir, link)

    except Exception as err:
        print('    ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))

    return link
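# Usage sketch (hypothetical values, assuming the usual layout where each link's
# folder lives under ARCHIVE_DIR and is named after its timestamp):
#
#   example_link = {
#       'url': 'https://example.com/page',
#       'timestamp': '1554225600',
#       'title': None,
#       'type': None,
#       'sources': ['bookmarks.html'],
#   }
#   archive_link(os.path.join(ARCHIVE_DIR, example_link['timestamp']), example_link)
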
def print_link_status_line(link_dir, link, is_new):
    print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n    {blue}{url}{reset}'.format(
        symbol='+' if is_new else '*',
        symbol_color=ANSI['green' if is_new else 'black'],
        now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        **{**link, 'title': link['title'] or link['url']},
        **ANSI,
    ))

    print('    > {}{}'.format(pretty_path(link_dir), ' (new)' if is_new else ''))
    # if link['type']:
    #     print('      i {}'.format(link['type']))

def attach_result_to_link(method):
    """
    Instead of returning a result={output: '...', status: 'success'} object,
    attach that result to the link's history & latest fields, then return
    the updated link object.
    """
    def decorator(fetch_func):
        @wraps(fetch_func)
        def timed_fetch_func(link_dir, link, overwrite=False, **kwargs):
            # initialize methods and history json field on link
            link['latest'] = link.get('latest') or {}
            link['latest'][method] = link['latest'].get(method) or None
            link['history'] = link.get('history') or {}
            link['history'][method] = link['history'].get(method) or []

            start_ts = datetime.now().timestamp()

            # if a valid method output is already present, don't run the fetch function
            if link['latest'][method] and not overwrite:
                print('      √ {}'.format(method))
                result = None
            else:
                print('      > {}'.format(method))
                result = fetch_func(link_dir, link, **kwargs)

            end_ts = datetime.now().timestamp()
            duration = str(end_ts * 1000 - start_ts * 1000).split('.')[0]

            # append a history item recording fail/success
            history_entry = {
                'timestamp': str(start_ts).split('.')[0],
            }
            if result is None:
                history_entry['status'] = 'skipped'
            elif isinstance(result.get('output'), Exception):
                history_entry['status'] = 'failed'
                history_entry['duration'] = duration
                history_entry.update(result or {})
                link['history'][method].append(history_entry)
            else:
                history_entry['status'] = 'succeeded'
                history_entry['duration'] = duration
                history_entry.update(result or {})
                link['history'][method].append(history_entry)
                link['latest'][method] = result['output']

            _RESULTS_TOTALS[history_entry['status']] += 1

            return link
        return timed_fetch_func
    return decorator
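# For reference, each wrapped fetch_* function returns a dict like
# {'cmd': [...], 'output': 'output.pdf'} (with an Exception as 'output' on failure),
# and the decorator folds that into the link roughly like this (illustrative values):
#
#   link['latest']['pdf']  == 'output.pdf'
#   link['history']['pdf'] == [{'timestamp': '1554225600', 'status': 'succeeded',
#                               'duration': '1843', 'cmd': [...], 'output': 'output.pdf'}]
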
@attach_result_to_link('wget')
def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC, timeout=TIMEOUT):
    """download full site using wget"""

    domain_dir = os.path.join(link_dir, domain(link['url']))
    existing_file = wget_output_path(link)
    if os.path.exists(domain_dir) and existing_file:
        return {'output': existing_file, 'status': 'skipped'}

    if warc:
        warc_dir = os.path.join(link_dir, 'warc')
        os.makedirs(warc_dir, exist_ok=True)
        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    CMD = [
        WGET_BINARY,
        # '--server-response',  # print headers for better error parsing
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '-e', 'robots=off',
        '--restrict-file-names=unix',
        '--timeout={}'.format(timeout),
        *(() if warc else ('--timestamping',)),
        *(('--warc-file={}'.format(warc_path),) if warc else ()),
        *(('--page-requisites',) if requisites else ()),
        *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
        *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
        *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate', '--no-hsts')),
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        end()

        output = wget_output_path(link, look_in=domain_dir)

        output_tail = ['          ' + line for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] if line.strip()]

        # parse out number of files downloaded from "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        files_downloaded = (
            int(output_tail[-1].strip().split(' ', 2)[1] or 0)
            if 'Downloaded:' in output_tail[-1]
            else 0
        )

        # Check for common failure cases
        if result.returncode > 0 and files_downloaded < 1:
            print('        Got wget response code {}:'.format(result.returncode))
            print('\n'.join(output_tail))
            if b'403: Forbidden' in result.stderr:
                raise Exception('403 Forbidden (try changing WGET_USER_AGENT)')
            if b'404: Not Found' in result.stderr:
                raise Exception('404 Not Found')
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise Exception('500 Internal Server Error')
            raise Exception('Got an error from the server')
    except Exception as e:
        end()
        output = e
        print_error_hints(cmd=CMD, pwd=link_dir, err=e)

    return {
        'cmd': CMD,
        'output': output,
    }
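# Roughly, with WARC and page requisites enabled and a 60-second timeout, the command
# assembled above comes out looking like this (flags vary with config; the URL and
# WARC filename are placeholders):
#
#   wget --no-verbose --adjust-extension --convert-links --force-directories \
#        --backup-converted --span-hosts --no-parent -e robots=off \
#        --restrict-file-names=unix --timeout=60 --warc-file=warc/1554225600 \
#        --page-requisites --user-agent='ArchiveBox/...' https://example.com/page
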
@attach_result_to_link('pdf')
def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
    """print PDF of site to file using chrome --headless"""

    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    if os.path.exists(os.path.join(link_dir, 'output.pdf')):
        return {'output': 'output.pdf', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir),
        '--print-to-pdf',
        '--hide-scrollbars',
        '--timeout={}'.format(timeout * 1000),
        *(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        end()
        if result.returncode:
            print('     ', (result.stderr or result.stdout).decode())
            raise Exception('Failed to print PDF')
        chmod_file('output.pdf', cwd=link_dir)
        output = 'output.pdf'
    except Exception as e:
        end()
        output = e
        print_error_hints(cmd=CMD, pwd=link_dir, err=e)

    return {
        'cmd': CMD,
        'output': output,
    }

@attach_result_to_link('screenshot')
def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR, resolution=RESOLUTION):
    """take screenshot of site using chrome --headless"""

    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    if os.path.exists(os.path.join(link_dir, 'screenshot.png')):
        return {'output': 'screenshot.png', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir),
        '--screenshot',
        '--window-size={}'.format(resolution),
        '--hide-scrollbars',
        '--timeout={}'.format(timeout * 1000),
        *(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
        # '--full-page',   # TODO: make this actually work using ./bin/screenshot fullPage: true
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        end()
        if result.returncode:
            print('     ', (result.stderr or result.stdout).decode())
            raise Exception('Failed to take screenshot')
        chmod_file('screenshot.png', cwd=link_dir)
        output = 'screenshot.png'
    except Exception as e:
        end()
        output = e
        print_error_hints(cmd=CMD, pwd=link_dir, err=e)

    return {
        'cmd': CMD,
        'output': output,
    }

@attach_result_to_link('dom')
def fetch_dom(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
    """print HTML of site to file using chrome --dump-dom"""

    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    output_path = os.path.join(link_dir, 'output.html')
    if os.path.exists(output_path):
        return {'output': 'output.html', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir),
        '--dump-dom',
        '--timeout={}'.format(timeout * 1000),
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        with open(output_path, 'w+') as f:
            result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout)
        end()
        if result.returncode:
            print('     ', result.stderr.decode())
            raise Exception('Failed to fetch DOM')
        chmod_file('output.html', cwd=link_dir)
        output = 'output.html'
    except Exception as e:
        end()
        output = e
        print_error_hints(cmd=CMD, pwd=link_dir, err=e)

    return {
        'cmd': CMD,
        'output': output,
    }

def parse_archive_dot_org_response(response):
    # Parse archive.org response headers
    headers = defaultdict(list)

    # lowercase all the header names and store in dict
    for header in response.splitlines():
        if b':' not in header or not header.strip():
            continue
        name, val = header.decode().split(':', 1)
        headers[name.lower().strip()].append(val.strip())

    # Get successful archive url in "content-location" header or any errors
    content_location = headers['content-location']
    errors = headers['x-archive-wayback-runtime-error']
    return content_location, errors
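# Illustrative example (abridged, hypothetical header values): given curl --head output
# from the Wayback Machine such as
#
#   b'HTTP/2 200\r\ncontent-location: /web/20190402000000/https://example.com/\r\n...'
#
# this returns (['/web/20190402000000/https://example.com/'], []): the saved snapshot
# path, plus an empty list because no x-archive-wayback-runtime-error header was present.
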
@attach_result_to_link('archive_org')
def archive_dot_org(link_dir, link, timeout=TIMEOUT):
    """submit site to archive.org for archiving via their service, save returned archive url"""

    output = 'archive.org.txt'
    archive_org_url = None

    path = os.path.join(link_dir, output)
    if os.path.exists(path):
        archive_org_url = open(path, 'r').read().strip()
        return {'output': archive_org_url, 'status': 'skipped'}

    submit_url = 'https://web.archive.org/save/{}'.format(link['url'])
    CMD = [
        CURL_BINARY,
        '--location',
        '--head',
        '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(GIT_SHA),  # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
        '--max-time', str(timeout),
        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
        submit_url,
    ]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout)
        end()

        content_location, errors = parse_archive_dot_org_response(result.stdout)
        if content_location:
            archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
        elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
            archive_org_url = None
            # raise Exception('Archive.org denied by {}/robots.txt'.format(domain(link['url'])))
        elif errors:
            raise Exception(', '.join(errors))
        else:
            raise Exception('Failed to find "content-location" URL header in Archive.org response.')
    except Exception as e:
        end()
        output = e
        print_error_hints(cmd=CMD, pwd=link_dir, err=e)

    if not isinstance(output, Exception):
        # instead of writing None when archive.org rejects the url write the
        # url to resubmit it to archive.org. This is so when the user visits
        # the URL in person, it will attempt to re-archive it, and it'll show the
        # nicer error message explaining why the url was rejected if it fails.
        archive_org_url = archive_org_url or submit_url
        with open(os.path.join(link_dir, output), 'w', encoding='utf-8') as f:
            f.write(archive_org_url)
        chmod_file('archive.org.txt', cwd=link_dir)
        output = archive_org_url

    return {
        'cmd': CMD,
        'output': output,
    }

@attach_result_to_link('favicon')
def fetch_favicon(link_dir, link, timeout=TIMEOUT):
    """download site favicon from google's favicon api"""

    output = 'favicon.ico'
    if os.path.exists(os.path.join(link_dir, output)):
        return {'output': output, 'status': 'skipped'}

    CMD = [
        CURL_BINARY,
        '--max-time', str(timeout),
        '--location',
        '--output', output,
        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
        'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])),
    ]
    end = progress(timeout, prefix='      ')
    try:
        run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        end()
        chmod_file('favicon.ico', cwd=link_dir)
        output = 'favicon.ico'
    except Exception as e:
        end()
        output = e
        print_error_hints(cmd=CMD, pwd=link_dir, err=e)

    return {
        'cmd': CMD,
        'output': output,
    }

@attach_result_to_link('title')
def fetch_title(link_dir, link, timeout=TIMEOUT):
    """try to guess the page's title from its content"""

    # if link already has valid title, skip it
    if link['title'] and not link['title'].lower().startswith('http'):
        return {'output': link['title'], 'status': 'skipped'}

    end = progress(timeout, prefix='      ')
    try:
        title = fetch_page_title(link['url'], timeout=timeout, progress=False)
        end()
        output = title
    except Exception as e:
        end()
        print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        title = None
        output = e

    # titles should show up in the global index immediately for better UX,
    # do a hacky immediate replacement to add them in as we're archiving
    # TODO: figure out how to do this without gnarly string replacement
    if title:
        link['title'] = title
        patch_index_title_hack(link['url'], title)

    return {
        'cmd': 'fetch_page_title("{}")'.format(link['url']),
        'output': output,
    }

@attach_result_to_link('media')
def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
    """Download playlists or individual video, audio, and subtitles using youtube-dl"""

    output = os.path.join(link_dir, 'media')
    already_done = os.path.exists(output)  # and os.listdir(output)
    if already_done and not overwrite:
        return {'output': 'media', 'status': 'skipped'}

    os.makedirs(output, exist_ok=True)
    CMD = [
        YOUTUBEDL_BINARY,
        '--write-description',
        '--write-info-json',
        '--write-annotations',
        '--yes-playlist',
        '--write-thumbnail',
        '--no-call-home',
        '--all-subs',
        '--extract-audio',
        '--keep-video',
        '--ignore-errors',
        '--geo-bypass',
        '--audio-format', 'mp3',
        '--audio-quality', '320K',
        '--embed-thumbnail',
        '--add-metadata',
        *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output, timeout=timeout + 1)
        chmod_file('media', cwd=link_dir)
        output = 'media'
        end()
        if result.returncode:
            if (b'ERROR: Unsupported URL' in result.stderr
                or b'HTTP Error 404' in result.stderr
                or b'HTTP Error 403' in result.stderr
                or b'URL could be a direct video link' in result.stderr
                or b'Unable to extract container ID' in result.stderr):
                # These happen too frequently on non-media pages to warrant printing to console
                pass
            else:
                print('        got youtubedl response code {}:'.format(result.returncode))
                print(result.stderr)
                raise Exception('Failed to download media')
    except Exception as e:
        end()
        output = e
        print_error_hints(cmd=CMD, pwd=link_dir, err=e)

    return {
        'cmd': CMD,
        'output': output,
    }

@attach_result_to_link('git')
def fetch_git(link_dir, link, timeout=TIMEOUT):
    """download full site using git"""

    url_is_clonable = (
        domain(link['url']) in GIT_DOMAINS
        or link['url'].endswith('.git')
        or link['type'] == 'git'
    )
    if not url_is_clonable:
        return {'output': None, 'status': 'skipped'}

    git_dir = os.path.join(link_dir, 'git')
    if os.path.exists(git_dir):
        return {'output': 'git', 'status': 'skipped'}

    os.makedirs(git_dir, exist_ok=True)
    output = 'git'
    CMD = [
        GIT_BINARY,
        'clone',
        '--mirror',
        '--recursive',
        *(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
        without_query(without_fragment(link['url'])),
    ]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=git_dir, timeout=timeout + 1)
        end()

        if result.returncode == 128:
            # ignore failed re-download when the folder already exists
            pass
        elif result.returncode > 0:
            print('        got git response code {}:'.format(result.returncode))
            raise Exception('Failed git download')
    except Exception as e:
        end()
        output = e
        print_error_hints(cmd=CMD, pwd=link_dir, err=e)

    return {
        'cmd': CMD,
        'output': output,
    }
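# Illustrative example (assuming GIT_DOMAINS includes 'github.com', its usual default):
# a link such as 'https://github.com/user/repo?tab=readme#install' counts as clonable,
# and without_query(without_fragment(...)) trims it to 'https://github.com/user/repo'
# before it is handed to git clone --mirror.
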
def chrome_headless(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR, headless=CHROME_HEADLESS, sandbox=CHROME_SANDBOX):
    global USER_DATA_DIR
    user_data_dir = user_data_dir or USER_DATA_DIR
    cmd_args = [binary]

    if headless:
        cmd_args += ('--headless',)

    if not sandbox:
        # don't use GPU or sandbox when running inside docker container
        cmd_args += ('--no-sandbox', '--disable-gpu')

    # Find chrome user data directory
    default_profile_paths = (
        '~/.config/chromium',
        '~/.config/google-chrome',
        '~/.config/google-chrome-beta',
        '~/.config/google-chrome-unstable',
        '~/Library/Application Support/Chromium',
        '~/Library/Application Support/Google/Chrome',
        '~/Library/Application Support/Google/Chrome Canary',
        '~/AppData/Local/Chromium/User Data',
        '~/AppData/Local/Google/Chrome/User Data',
        '~/AppData/Local/Google/Chrome SxS/User Data',
    )
    if user_data_dir:
        cmd_args.append('--user-data-dir={}'.format(user_data_dir))
    else:
        for path in default_profile_paths:
            full_path = os.path.expanduser(path)
            if os.path.exists(full_path):
                USER_DATA_DIR = full_path
                cmd_args.append('--user-data-dir={}'.format(full_path))
                break

    return cmd_args


USER_DATA_DIR = CHROME_USER_DATA_DIR
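# Illustrative example (hypothetical binary and profile paths, assuming headless mode
# and the sandbox are both enabled): chrome_headless() might return
#
#   ['chromium-browser', '--headless', '--user-data-dir=/home/user/.config/chromium']
#
# which fetch_pdf, fetch_screenshot, and fetch_dom splat into their CMD lists above.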