import os

from typing import Dict, List, Tuple, Optional

from collections import defaultdict
from datetime import datetime

from .schema import Link, ArchiveResult, ArchiveOutput
from .index import (
    load_link_details,
    write_link_details,
    patch_main_index,
)
from .config import (
    CURL_BINARY,
    GIT_BINARY,
    WGET_BINARY,
    YOUTUBEDL_BINARY,
    FETCH_FAVICON,
    FETCH_TITLE,
    FETCH_WGET,
    FETCH_WGET_REQUISITES,
    FETCH_PDF,
    FETCH_SCREENSHOT,
    FETCH_DOM,
    FETCH_WARC,
    FETCH_GIT,
    FETCH_MEDIA,
    SUBMIT_ARCHIVE_DOT_ORG,
    TIMEOUT,
    MEDIA_TIMEOUT,
    GIT_DOMAINS,
    VERSION,
    WGET_USER_AGENT,
    CHECK_SSL_VALIDITY,
    COOKIES_FILE,
    CURL_VERSION,
    WGET_VERSION,
    CHROME_VERSION,
    GIT_VERSION,
    YOUTUBEDL_VERSION,
    WGET_AUTO_COMPRESSION,
)
from .util import (
    enforce_types,
    domain,
    extension,
    without_query,
    without_fragment,
    fetch_page_title,
    is_static_file,
    TimedProgress,
    chmod_file,
    wget_output_path,
    chrome_args,
    run, PIPE, DEVNULL,
)
from .logs import (
    log_link_archiving_started,
    log_link_archiving_finished,
    log_archive_method_started,
    log_archive_method_finished,
)
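
# Raised by the archive methods below when a tool reports a failure; `hints` can
# carry extra context (e.g. command output lines) alongside the error message.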
class ArchiveError(Exception):
    def __init__(self, message, hints=None):
        super().__init__(message)
        self.hints = hints


@enforce_types
def archive_link(link: Link, out_dir: Optional[str]=None) -> Link:
    """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
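
    # Each entry is (method_name, should_fetch_* predicate, fetch_* function);
    # the methods are tried in this order for every link.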
    ARCHIVE_METHODS = (
        ('title', should_fetch_title, fetch_title),
        ('favicon', should_fetch_favicon, fetch_favicon),
        ('wget', should_fetch_wget, fetch_wget),
        ('pdf', should_fetch_pdf, fetch_pdf),
        ('screenshot', should_fetch_screenshot, fetch_screenshot),
        ('dom', should_fetch_dom, fetch_dom),
        ('git', should_fetch_git, fetch_git),
        ('media', should_fetch_media, fetch_media),
        ('archive_org', should_fetch_archive_dot_org, archive_dot_org),
    )

    out_dir = out_dir or link.link_dir
    try:
        is_new = not os.path.exists(out_dir)
        if is_new:
            os.makedirs(out_dir)

        link = load_link_details(link, out_dir=out_dir)
        log_link_archiving_started(link, out_dir, is_new)
        link = link.overwrite(updated=datetime.now())
        stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
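
        # Try each method in turn: skipped methods just bump stats['skipped'], while
        # executed ones append their ArchiveResult to link.history and tally its status.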
        for method_name, should_run, method_function in ARCHIVE_METHODS:
            try:
                if method_name not in link.history:
                    link.history[method_name] = []

                if should_run(link, out_dir):
                    log_archive_method_started(method_name)

                    result = method_function(link=link, out_dir=out_dir)

                    link.history[method_name].append(result)
                    stats[result.status] += 1
                    log_archive_method_finished(result)
                else:
                    stats['skipped'] += 1
            except Exception as e:
                raise Exception('Exception in archive_methods.fetch_{}(Link(url={}))'.format(
                    method_name,
                    link.url,
                )) from e

        # print('    ', stats)

        write_link_details(link, out_dir=link.link_dir)
        patch_main_index(link)

        # # If any changes were made, update the main links index json and html
        # was_changed = stats['succeeded'] or stats['failed']
        # if was_changed:
        #     patch_main_index(link)

        log_link_archiving_finished(link, link.link_dir, is_new, stats)
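
    # On Ctrl-C, make a best-effort attempt to flush the partial results to the
    # link's index before re-raising, so methods that already finished aren't lost.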
    except KeyboardInterrupt:
        try:
            write_link_details(link, out_dir=link.link_dir)
        except:
            pass
        raise

    except Exception as err:
        print('    ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
        raise

    return link


### Archive Method Functions
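# Each archive method comes in two parts: a should_fetch_*(link, out_dir) predicate
# that checks the config flags and skips work whose output already exists, and a
# fetch_*() function that runs the tool and returns an ArchiveResult for link.history.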

@enforce_types
def should_fetch_title(link: Link, out_dir: Optional[str]=None) -> bool:
    # if link already has valid title, skip it
    if link.title and not link.title.lower().startswith('http'):
        return False

    if is_static_file(link.url):
        return False

    return FETCH_TITLE


@enforce_types
def fetch_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """try to guess the page's title from its content"""

    output: ArchiveOutput = None
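
    # cmd is only recorded in the ArchiveResult below for reference; the title is
    # actually fetched in-process via fetch_page_title() in the try block.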
    cmd = [
        CURL_BINARY,
        link.url,
        '|',
        'grep',
        '<title',
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='        ')
    try:
        output = fetch_page_title(link.url, timeout=timeout, progress=False)
        if not output:
            raise ArchiveError('Unable to detect page title')
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=CURL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )


@enforce_types
def should_fetch_favicon(link: Link, out_dir: Optional[str]=None) -> bool:
    out_dir = out_dir or link.link_dir
    if os.path.exists(os.path.join(out_dir, 'favicon.ico')):
        return False

    return FETCH_FAVICON


@enforce_types
def fetch_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download site favicon from google's favicon api"""

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'favicon.ico'
    cmd = [
        CURL_BINARY,
        '--max-time', str(timeout),
        '--location',
        '--output', str(output),
        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
        'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='        ')
    try:
        run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
        chmod_file(output, cwd=out_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=CURL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )


@enforce_types
def should_fetch_wget(link: Link, out_dir: Optional[str]=None) -> bool:
    output_path = wget_output_path(link)
    out_dir = out_dir or link.link_dir
    if output_path and os.path.exists(os.path.join(out_dir, output_path)):
        return False

    return FETCH_WGET


@enforce_types
def fetch_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using wget"""

    out_dir = out_dir or link.link_dir
    if FETCH_WARC:
        warc_dir = os.path.join(out_dir, 'warc')
        os.makedirs(warc_dir, exist_ok=True)
        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
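
    # When FETCH_WARC is enabled, wget also records the crawl as a WARC file under
    # <out_dir>/warc/<unix-timestamp> via the --warc-file flag below.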

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    output: ArchiveOutput = None
    cmd = [
        WGET_BINARY,
        # '--server-response',  # print headers for better error parsing
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '-e', 'robots=off',
        '--restrict-file-names=windows',
        '--timeout={}'.format(timeout),
        *([] if FETCH_WARC else ['--timestamping']),
        *(['--warc-file={}'.format(warc_path)] if FETCH_WARC else []),
        *(['--page-requisites'] if FETCH_WGET_REQUISITES else []),
        *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
        *(['--load-cookies', COOKIES_FILE] if COOKIES_FILE else []),
        *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
        link.url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='        ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
        output = wget_output_path(link)

        # parse out number of files downloaded from last line of stderr:
        # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        output_tail = [
            line.strip()
            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
            if line.strip()
        ]
        files_downloaded = (
            int(output_tail[-1].strip().split(' ', 2)[1] or 0)
            if 'Downloaded:' in output_tail[-1]
            else 0
        )
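
        # wget often exits non-zero even when most of the page was saved (e.g. one
        # failed requisite), so only treat it as a failure if nothing was downloaded.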
        # Check for common failure cases
        if result.returncode > 0 and files_downloaded < 1:
            hints = (
                'Got wget response code: {}.'.format(result.returncode),
                *output_tail,
            )
            if b'403: Forbidden' in result.stderr:
                raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
            if b'404: Not Found' in result.stderr:
                raise ArchiveError('404 Not Found', hints)
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise ArchiveError('500 Internal Server Error', hints)
            raise ArchiveError('Got an error from the server', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=WGET_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )


@enforce_types
def should_fetch_pdf(link: Link, out_dir: Optional[str]=None) -> bool:
    out_dir = out_dir or link.link_dir
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(out_dir, 'output.pdf')):
        return False

    return FETCH_PDF


@enforce_types
def fetch_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """print PDF of site to file using chrome --headless"""

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'output.pdf'
    cmd = [
        *chrome_args(TIMEOUT=timeout),
        '--print-to-pdf',
        link.url,
    ]
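
    # No output filename is passed: headless Chrome writes 'output.pdf' into its
    # working directory, which run() sets to out_dir below.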
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='        ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)

        if result.returncode:
            hints = (result.stderr or result.stdout).decode()
            raise ArchiveError('Failed to print PDF', hints)

        chmod_file('output.pdf', cwd=out_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=CHROME_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )


@enforce_types
def should_fetch_screenshot(link: Link, out_dir: Optional[str]=None) -> bool:
    out_dir = out_dir or link.link_dir
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(out_dir, 'screenshot.png')):
        return False

    return FETCH_SCREENSHOT


@enforce_types
def fetch_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """take screenshot of site using chrome --headless"""

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'screenshot.png'
    cmd = [
        *chrome_args(TIMEOUT=timeout),
        '--screenshot',
        link.url,
    ]
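
    # As with the PDF method, Chrome writes 'screenshot.png' into its working
    # directory (out_dir) because no explicit output path is given.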
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='        ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)

        if result.returncode:
            hints = (result.stderr or result.stdout).decode()
            raise ArchiveError('Failed to take screenshot', hints)

        chmod_file(output, cwd=out_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=CHROME_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )


@enforce_types
def should_fetch_dom(link: Link, out_dir: Optional[str]=None) -> bool:
    out_dir = out_dir or link.link_dir
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(out_dir, 'output.html')):
        return False

    return FETCH_DOM


@enforce_types
def fetch_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """print HTML of site to file using chrome --dump-dom"""

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'output.html'
    output_path = os.path.join(out_dir, str(output))
    cmd = [
        *chrome_args(TIMEOUT=timeout),
        '--dump-dom',
        link.url
    ]
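
    # --dump-dom prints the rendered page's HTML to stdout, which is redirected
    # into output.html via the open file handle below.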
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='        ')
    try:
        with open(output_path, 'w+') as f:
            result = run(cmd, stdout=f, stderr=PIPE, cwd=out_dir, timeout=timeout)

        if result.returncode:
            hints = result.stderr.decode()
            raise ArchiveError('Failed to fetch DOM', hints)

        chmod_file(output, cwd=out_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=CHROME_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )


@enforce_types
def should_fetch_git(link: Link, out_dir: Optional[str]=None) -> bool:
    out_dir = out_dir or link.link_dir
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(out_dir, 'git')):
        return False

    is_clonable_url = (
        (domain(link.url) in GIT_DOMAINS)
        or (extension(link.url) == 'git')
    )
    if not is_clonable_url:
        return False

    return FETCH_GIT


@enforce_types
def fetch_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using git"""

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'git'
    output_path = os.path.join(out_dir, str(output))
    os.makedirs(output_path, exist_ok=True)
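
    # The clone runs with cwd set to <out_dir>/git/; the URL's query string and
    # fragment are stripped before being passed to `git clone --mirror --recursive`.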
    cmd = [
        GIT_BINARY,
        'clone',
        '--mirror',
        '--recursive',
        *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
        without_query(without_fragment(link.url)),
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='        ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
        if result.returncode == 128:
            # ignore failed re-download when the folder already exists
            pass
        elif result.returncode > 0:
            hints = 'Got git response code: {}.'.format(result.returncode)
            raise ArchiveError('Failed git download', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=GIT_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )


@enforce_types
def should_fetch_media(link: Link, out_dir: Optional[str]=None) -> bool:
    out_dir = out_dir or link.link_dir
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(out_dir, 'media')):
        return False

    return FETCH_MEDIA


@enforce_types
def fetch_media(link: Link, out_dir: Optional[str]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
    """Download playlists or individual video, audio, and subtitles using youtube-dl"""

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'media'
    output_path = os.path.join(out_dir, str(output))
    os.makedirs(output_path, exist_ok=True)
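
    # youtube-dl runs with cwd set to <out_dir>/media/, so the downloaded videos,
    # audio, thumbnails, and subtitles all land inside that folder.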
    cmd = [
        YOUTUBEDL_BINARY,
        '--write-description',
        '--write-info-json',
        '--write-annotations',
        '--yes-playlist',
        '--write-thumbnail',
        '--no-call-home',
        '--no-check-certificate',
        '--all-subs',
        '--extract-audio',
        '--keep-video',
        '--ignore-errors',
        '--geo-bypass',
        '--audio-format', 'mp3',
        '--audio-quality', '320K',
        '--embed-thumbnail',
        '--add-metadata',
        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
        link.url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='        ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
        chmod_file(output, cwd=out_dir)
        if result.returncode:
            if (b'ERROR: Unsupported URL' in result.stderr
                or b'HTTP Error 404' in result.stderr
                or b'HTTP Error 403' in result.stderr
                or b'URL could be a direct video link' in result.stderr
                or b'Unable to extract container ID' in result.stderr):
                # These happen too frequently on non-media pages to warrant printing to console
                pass
            else:
                hints = (
                    'Got youtube-dl response code: {}.'.format(result.returncode),
                    *result.stderr.decode().split('\n'),
                )
                raise ArchiveError('Failed to download media', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=YOUTUBEDL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )


@enforce_types
def should_fetch_archive_dot_org(link: Link, out_dir: Optional[str]=None) -> bool:
    out_dir = out_dir or link.link_dir
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(out_dir, 'archive.org.txt')):
        # if open(path, 'r').read().strip() != 'None':
        return False

    return SUBMIT_ARCHIVE_DOT_ORG


@enforce_types
def archive_dot_org(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """submit site to archive.org for archiving via their service, save returned archive url"""

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'archive.org.txt'
    archive_org_url = None
    submit_url = 'https://web.archive.org/save/{}'.format(link.url)
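
    # Submitting is just an HTTP HEAD request to https://web.archive.org/save/<url>;
    # the snapshot path comes back in the content-location response header.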
    cmd = [
        CURL_BINARY,
        '--location',
        '--head',
        '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION),  # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
        '--max-time', str(timeout),
        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
        submit_url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='        ')
    try:
        result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout)
        content_location, errors = parse_archive_dot_org_response(result.stdout)
        if content_location:
            archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
        elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
            archive_org_url = None
            # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))
        elif errors:
            raise ArchiveError(', '.join(errors))
        else:
            raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    if output and not isinstance(output, Exception):
        # instead of writing None when archive.org rejects the url write the
        # url to resubmit it to archive.org. This is so when the user visits
        # the URL in person, it will attempt to re-archive it, and it'll show the
        # nicer error message explaining why the url was rejected if it fails.
        archive_org_url = archive_org_url or submit_url
        with open(os.path.join(out_dir, str(output)), 'w', encoding='utf-8') as f:
            f.write(archive_org_url)
        chmod_file('archive.org.txt', cwd=out_dir)
        output = archive_org_url

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=CURL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )


@enforce_types
def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
    # Parse archive.org response headers
    headers: Dict[str, List[str]] = defaultdict(list)

    # lowercase all the header names and store in dict
    for header in response.splitlines():
        if b':' not in header or not header.strip():
            continue
        name, val = header.decode().split(':', 1)
        headers[name.lower().strip()].append(val.strip())

    # Get successful archive url in "content-location" header or any errors
    content_location = headers['content-location']
    errors = headers['x-archive-wayback-runtime-error']
    return content_location, errors
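
# Illustrative example (hypothetical response bytes):
#   parse_archive_dot_org_response(
#       b'HTTP/2 200\ncontent-location: /web/20190401000000/https://example.com\n'
#   ) == (['/web/20190401000000/https://example.com'], [])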