2017-10-18 22:38:17 +00:00
import os
2018-04-25 07:49:26 +00:00
from collections import defaultdict
2017-10-18 22:38:17 +00:00
from datetime import datetime
2019-02-21 22:45:28 +00:00
from index import (
write_link_index ,
2019-03-21 05:28:12 +00:00
patch_links_index ,
load_json_link_index ,
2019-02-21 22:45:28 +00:00
)
2017-10-18 22:38:17 +00:00
from config import (
2019-02-21 20:47:15 +00:00
CURL_BINARY ,
GIT_BINARY ,
WGET_BINARY ,
YOUTUBEDL_BINARY ,
2019-02-19 06:44:54 +00:00
FETCH_FAVICON ,
FETCH_TITLE ,
2017-10-18 22:38:17 +00:00
FETCH_WGET ,
FETCH_WGET_REQUISITES ,
FETCH_PDF ,
FETCH_SCREENSHOT ,
2018-06-10 22:45:41 +00:00
FETCH_DOM ,
2019-01-11 12:02:49 +00:00
FETCH_WARC ,
2019-01-11 10:18:49 +00:00
FETCH_GIT ,
2019-01-11 10:52:29 +00:00
FETCH_MEDIA ,
2017-10-18 22:38:17 +00:00
SUBMIT_ARCHIVE_DOT_ORG ,
TIMEOUT ,
2019-01-11 11:33:35 +00:00
MEDIA_TIMEOUT ,
2017-10-18 22:38:17 +00:00
ANSI ,
2019-03-21 05:28:12 +00:00
OUTPUT_DIR ,
2019-01-11 10:27:25 +00:00
GIT_DOMAINS ,
2019-02-05 04:02:33 +00:00
GIT_SHA ,
2020-04-15 08:54:53 +00:00
RESTRICT_FILE_NAMES ,
2020-04-05 15:22:59 +00:00
CURL_USER_AGENT ,
2019-03-21 05:28:12 +00:00
WGET_USER_AGENT ,
CHECK_SSL_VALIDITY ,
COOKIES_FILE ,
2019-03-26 23:38:50 +00:00
WGET_AUTO_COMPRESSION
2017-10-18 22:38:17 +00:00
)
from util import (
2019-02-27 20:42:49 +00:00
domain ,
2019-03-21 05:28:12 +00:00
extension ,
2019-02-27 20:55:39 +00:00
without_query ,
2019-02-21 21:03:19 +00:00
without_fragment ,
2019-02-19 06:44:54 +00:00
fetch_page_title ,
2019-03-21 01:12:43 +00:00
is_static_file ,
2019-03-21 09:35:41 +00:00
TimedProgress ,
2017-10-18 22:38:17 +00:00
chmod_file ,
2019-03-08 21:25:15 +00:00
wget_output_path ,
2019-03-21 05:28:12 +00:00
chrome_args ,
2019-03-21 09:35:41 +00:00
check_link_structure ,
run , PIPE , DEVNULL
2019-03-21 05:28:12 +00:00
)
from logs import (
log_link_archiving_started ,
2019-03-22 18:01:27 +00:00
log_link_archiving_finished ,
2019-03-22 19:09:39 +00:00
log_archive_method_started ,
2019-03-21 09:35:41 +00:00
log_archive_method_finished ,
2017-10-18 22:38:17 +00:00
)
2017-10-23 09:57:34 +00:00
2018-06-10 22:45:41 +00:00
2019-03-21 01:10:09 +00:00
class ArchiveError(Exception):
    """Error raised by an archive method when it cannot produce its output.

    Besides the usual exception message, an instance carries an optional
    ``hints`` attribute holding extra troubleshooting details; it is stored
    exactly as passed and is ``None`` when omitted.
    """

    def __init__(self, message, hints=None):
        super(ArchiveError, self).__init__(message)
        self.hints = hints
2019-03-21 09:35:41 +00:00
def archive_link(link_dir, link):
    """Run every applicable archive method for one link and record the results.

    The link folder is created if it does not exist yet, the link is reloaded
    through ``load_json_link_index``, and each (name, should_run, run) triple
    below is tried in order. Every result is appended to
    ``link['history'][name]`` and tallied by its ``'status'``; methods whose
    ``should_run`` check is falsy are counted as skipped. Afterwards the
    per-link index and the main index are updated.

    Returns the (reloaded and updated) link. Any exception is printed and then
    re-raised to the caller.
    """
    methods = (
        ('title', should_fetch_title, fetch_title),
        ('favicon', should_fetch_favicon, fetch_favicon),
        ('wget', should_fetch_wget, fetch_wget),
        ('pdf', should_fetch_pdf, fetch_pdf),
        ('screenshot', should_fetch_screenshot, fetch_screenshot),
        ('dom', should_fetch_dom, fetch_dom),
        ('git', should_fetch_git, fetch_git),
        ('media', should_fetch_media, fetch_media),
        ('archive_org', should_fetch_archive_dot_org, archive_dot_org),
    )

    try:
        if os.path.exists(link_dir):
            is_new = False
        else:
            is_new = True
            os.makedirs(link_dir)

        link = load_json_link_index(link_dir, link)
        log_link_archiving_started(link_dir, link, is_new)

        stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
        for name, is_needed, fetch in methods:
            # make sure there is a history list for this method, even if skipped
            if name not in link['history']:
                link['history'][name] = []

            if not is_needed(link_dir, link):
                stats['skipped'] += 1
                continue

            log_archive_method_started(name)
            outcome = fetch(link_dir, link)
            link['history'][name].append(outcome)
            stats[outcome['status']] += 1
            log_archive_method_finished(outcome)

        write_link_index(link_dir, link)
        patch_links_index(link)
        log_link_archiving_finished(link_dir, link, is_new, stats)
    except Exception as err:
        print('! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
        raise

    return link
2019-03-22 19:09:39 +00:00
### Archive Method Functions
2019-03-21 05:28:12 +00:00
2019-03-21 09:35:41 +00:00
def should_fetch_title(link_dir, link):
    """Decide whether the title method should run for this link.

    Returns False when the link already has a title that does not look like a
    URL (i.e. does not start with "http", case-insensitively) or when the URL
    is a static file; otherwise returns the FETCH_TITLE setting.
    """
    title = link['title']
    has_real_title = bool(title) and not title.lower().startswith('http')
    if has_real_title or is_static_file(link['url']):
        return False
    return FETCH_TITLE
2019-03-21 05:28:12 +00:00
2019-03-21 09:35:41 +00:00
def fetch_title(link_dir, link, timeout=TIMEOUT):
    """Try to determine the page's title via ``fetch_page_title``.

    Returns a result dict with 'cmd', 'pwd', 'output' (the title, or the
    exception on failure), 'status' ('succeeded' or 'failed') and the entries
    of ``timer.stats``. A falsy title is treated as a failure.
    """
    url = link['url']
    # This command is never executed here; it is only recorded in the result
    # next to the output, while the actual work is done by fetch_page_title().
    cmd = [CURL_BINARY, url, '|', 'grep', '<title>']

    status, output = 'succeeded', None
    timer = TimedProgress(timeout, prefix=' ')
    try:
        output = fetch_page_title(url, timeout=timeout, progress=False)
        if not output:
            raise ArchiveError('Unable to detect page title')
    except Exception as err:
        status, output = 'failed', err
    finally:
        timer.end()

    return {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
        **timer.stats,
    }
2019-03-21 09:35:41 +00:00
def should_fetch_favicon(link_dir, link):
    """Return False if favicon.ico already exists in link_dir, else FETCH_FAVICON."""
    favicon_path = os.path.join(link_dir, 'favicon.ico')
    return False if os.path.exists(favicon_path) else FETCH_FAVICON
2019-03-21 05:28:12 +00:00
def fetch_favicon(link_dir, link, timeout=TIMEOUT):
    """Download the site's favicon from Google's favicon API using curl.

    The icon is written to 'favicon.ico' inside link_dir (curl runs with
    cwd=link_dir). Returns a result dict with 'cmd', 'pwd', 'output' (the file
    name, or the exception on failure), 'status' and the ``timer.stats``
    entries. Note that curl's exit code is not inspected here.
    """
    output = 'favicon.ico'
    insecure_flag = () if CHECK_SSL_VALIDITY else ('--insecure',)
    favicon_url = 'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url']))
    cmd = [
        CURL_BINARY,
        '--max-time', str(timeout),
        '--location',
        '--output', output,
        *insecure_flag,
        favicon_url,
    ]

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        chmod_file(output, cwd=link_dir)
    except Exception as err:
        status, output = 'failed', err
    finally:
        timer.end()

    return {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
        **timer.stats,
    }
2017-10-18 22:38:17 +00:00
2019-03-21 09:35:41 +00:00
def should_fetch_wget(link_dir, link):
    """Return False if the path reported by wget_output_path() already exists, else FETCH_WGET."""
    expected = wget_output_path(link)
    already_saved = bool(expected) and os.path.exists(os.path.join(link_dir, expected))
    return False if already_saved else FETCH_WGET
2019-03-21 05:28:12 +00:00
def fetch_wget(link_dir, link, timeout=TIMEOUT):
    """Download the page with wget, optionally with requisites and a WARC file.

    wget runs with cwd=link_dir. The run is treated as failed only when wget
    exits non-zero AND its summary reports no downloaded files; in that case
    an ArchiveError with the tail of wget's output as hints is recorded.

    Returns a result dict with 'cmd', 'pwd', 'output' (the value of
    ``wget_output_path(link)``, or the exception on failure), 'status'
    ('succeeded' or 'failed') and the entries of ``timer.stats``.
    """
    warc_path = None
    if FETCH_WARC:
        warc_dir = os.path.join(link_dir, 'warc')
        os.makedirs(warc_dir, exist_ok=True)
        # relative path, because wget is executed with cwd=link_dir
        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    output = None
    cmd = [
        WGET_BINARY,
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '-e', 'robots=off',
        *(('--restrict-file-names={}'.format(RESTRICT_FILE_NAMES),) if RESTRICT_FILE_NAMES else ()),
        '--timeout={}'.format(timeout),
        *(('--compression=auto',) if WGET_AUTO_COMPRESSION else ()),
        *(() if FETCH_WARC else ('--timestamping',)),
        *(('--warc-file={}'.format(warc_path),) if FETCH_WARC else ()),
        *(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
        *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
        *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
        *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate', '--no-hsts')),
        link['url'],
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        output = wget_output_path(link)

        # keep the last few non-empty lines of wget's combined output; the
        # final one normally looks like:
        #   "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        output_tail = [
            line.strip()
            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
            if line.strip()
        ]
        files_downloaded = _count_wget_downloads(output_tail)

        # Check for common failure cases
        if result.returncode > 0 and files_downloaded < 1:
            hints = (
                'Got wget response code: {}.'.format(result.returncode),
                *output_tail,
            )
            if b'403: Forbidden' in result.stderr:
                raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
            if b'404: Not Found' in result.stderr:
                raise ArchiveError('404 Not Found', hints)
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise ArchiveError('500 Internal Server Error', hints)
            raise ArchiveError('Got an error from the server', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
        **timer.stats,
    }


def _count_wget_downloads(output_tail):
    """Return the file count from wget's "Downloaded: N files, ..." line, or 0.

    An empty tail, a missing summary line, or a summary in an unexpected
    format all yield 0 instead of raising, so that a quiet but successful
    wget run is not misreported as a failure.
    """
    if not output_tail or 'Downloaded:' not in output_tail[-1]:
        return 0
    fields = output_tail[-1].strip().split(' ', 2)
    try:
        return int(fields[1] or 0)
    except (IndexError, ValueError):
        return 0
2019-03-21 09:35:41 +00:00
def should_fetch_pdf(link_dir, link):
    """Return False for static-file URLs or when output.pdf already exists, else FETCH_PDF."""
    if is_static_file(link['url']):
        return False
    pdf_exists = os.path.exists(os.path.join(link_dir, 'output.pdf'))
    return False if pdf_exists else FETCH_PDF
2019-03-21 05:28:12 +00:00
def fetch_pdf(link_dir, link, timeout=TIMEOUT):
    """Print the page to a PDF using headless chrome (``--print-to-pdf``).

    Chrome runs with cwd=link_dir. A non-zero exit code is recorded as a
    failure, with chrome's stderr (or stdout if stderr is empty) as hints.

    Returns a result dict with 'cmd', 'pwd', 'output' ('output.pdf', or the
    exception on failure), 'status' and the ``timer.stats`` entries.
    """
    output = 'output.pdf'
    cmd = [*chrome_args(TIMEOUT=timeout), '--print-to-pdf', link['url']]

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        proc = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        if proc.returncode:
            details = (proc.stderr or proc.stdout).decode()
            raise ArchiveError('Failed to print PDF', details)
        chmod_file(output, cwd=link_dir)
    except Exception as err:
        status, output = 'failed', err
    finally:
        timer.end()

    return {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
        **timer.stats,
    }
2019-03-21 09:35:41 +00:00
def should_fetch_screenshot(link_dir, link):
    """Return False for static-file URLs or when screenshot.png already exists, else FETCH_SCREENSHOT."""
    if is_static_file(link['url']):
        return False
    screenshot_path = os.path.join(link_dir, 'screenshot.png')
    if os.path.exists(screenshot_path):
        return False
    return FETCH_SCREENSHOT
2019-03-21 05:28:12 +00:00
def fetch_screenshot(link_dir, link, timeout=TIMEOUT):
    """Take a screenshot of the page using headless chrome (``--screenshot``).

    Chrome runs with cwd=link_dir. A non-zero exit code is recorded as a
    failure, with chrome's stderr (or stdout if stderr is empty) as hints.

    Returns a result dict with 'cmd', 'pwd', 'output' ('screenshot.png', or
    the exception on failure), 'status' and the ``timer.stats`` entries.
    """
    output = 'screenshot.png'
    cmd = [*chrome_args(TIMEOUT=timeout), '--screenshot', link['url']]

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        proc = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        if proc.returncode:
            details = (proc.stderr or proc.stdout).decode()
            raise ArchiveError('Failed to take screenshot', details)
        chmod_file(output, cwd=link_dir)
    except Exception as err:
        status, output = 'failed', err
    finally:
        timer.end()

    return {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
        **timer.stats,
    }
2019-03-21 09:35:41 +00:00
def should_fetch_dom(link_dir, link):
    """Return False for static-file URLs or when output.html already exists, else FETCH_DOM."""
    if is_static_file(link['url']):
        return False
    dom_exists = os.path.exists(os.path.join(link_dir, 'output.html'))
    return False if dom_exists else FETCH_DOM
2017-10-18 22:38:17 +00:00
2019-03-21 05:28:12 +00:00
def fetch_dom(link_dir, link, timeout=TIMEOUT):
    """Dump the page's rendered DOM to output.html using headless chrome.

    Chrome's stdout (``--dump-dom``) is written straight into
    ``<link_dir>/output.html``. A non-zero exit code is recorded as a failure
    with chrome's stderr as hints.

    Returns a result dict with 'cmd', 'pwd', 'output' ('output.html', or the
    exception on failure), 'status' and the ``timer.stats`` entries.
    """
    output = 'output.html'
    output_path = os.path.join(link_dir, output)
    cmd = [
        *chrome_args(TIMEOUT=timeout),
        '--dump-dom',
        link['url'],
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        with open(output_path, 'w+') as f:
            result = run(cmd, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout)

        if result.returncode:
            hints = result.stderr.decode()
            raise ArchiveError('Failed to fetch DOM', hints)

        chmod_file(output, cwd=link_dir)
    except Exception as err:
        status = 'failed'
        output = err
        # The output file is created before chrome runs, so a failed run can
        # leave a zero-byte output.html behind. should_fetch_dom() skips links
        # whose output.html exists, so such a placeholder would block every
        # future retry. Remove it, but keep any partial content chrome wrote.
        try:
            if os.path.getsize(output_path) == 0:
                os.remove(output_path)
        except OSError:
            # best-effort cleanup: the file may be missing or not removable
            pass
    finally:
        timer.end()

    return {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
        **timer.stats,
    }
2017-10-18 22:38:17 +00:00
2019-03-21 09:35:41 +00:00
def should_fetch_git(link_dir, link):
    """Decide whether the git method should run for this link.

    Returns False for static-file URLs, when a 'git' folder already exists in
    link_dir, or when the URL is neither on one of GIT_DOMAINS nor has the
    'git' extension; otherwise returns the FETCH_GIT setting.
    """
    url = link['url']
    if is_static_file(url):
        return False
    if os.path.exists(os.path.join(link_dir, 'git')):
        return False

    # not clonable: domain is not a known git host AND url does not end in .git
    if domain(url) not in GIT_DOMAINS and extension(url) != 'git':
        return False

    return FETCH_GIT
2017-10-18 22:38:17 +00:00
2019-03-21 09:35:41 +00:00
def fetch_git(link_dir, link, timeout=TIMEOUT):
    """Clone the repository behind the link's URL using ``git clone --mirror``.

    The URL is cloned without its query string and fragment, and git runs
    inside ``<link_dir>/git`` (created if missing). Exit code 128 is
    deliberately ignored; any other non-zero exit code is recorded as a
    failure.

    Returns a result dict with 'cmd', 'pwd', 'output' ('git', or the exception
    on failure), 'status' and the ``timer.stats`` entries.
    """
    output = 'git'
    clone_dir = os.path.join(link_dir, 'git')
    os.makedirs(clone_dir, exist_ok=True)

    ssl_override = () if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')
    clone_url = without_query(without_fragment(link['url']))
    cmd = [GIT_BINARY, 'clone', '--mirror', '--recursive', *ssl_override, clone_url]

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        proc = run(cmd, stdout=PIPE, stderr=PIPE, cwd=clone_dir, timeout=timeout + 1)
        # 128 is ignored: failed re-download when the folder already exists
        if proc.returncode != 128 and proc.returncode > 0:
            details = 'Got git response code: {}.'.format(proc.returncode)
            raise ArchiveError('Failed git download', details)
    except Exception as err:
        status, output = 'failed', err
    finally:
        timer.end()

    return {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
        **timer.stats,
    }
2019-03-22 19:09:39 +00:00
2019-03-21 09:35:41 +00:00
def should_fetch_media(link_dir, link):
    """Return False for static-file URLs or when a 'media' folder already exists, else FETCH_MEDIA."""
    if is_static_file(link['url']):
        return False
    media_dir = os.path.join(link_dir, 'media')
    return False if os.path.exists(media_dir) else FETCH_MEDIA
def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT):
    """Download playlists or individual video, audio, and subtitles using youtube-dl.

    youtube-dl runs inside ``<link_dir>/media`` (created if missing). A
    non-zero exit code is recorded as a failure unless stderr contains one of
    a handful of messages that are common on non-media pages, which are
    silently ignored.

    Returns a result dict with 'cmd', 'pwd', 'output' ('media', or the
    exception on failure), 'status' and the ``timer.stats`` entries.
    """
    output = 'media'
    output_path = os.path.join(link_dir, 'media')
    os.makedirs(output_path, exist_ok=True)
    cmd = [
        YOUTUBEDL_BINARY,
        '--write-description',
        '--write-info-json',
        '--write-annotations',
        '--yes-playlist',
        '--write-thumbnail',
        '--no-call-home',
        # NOTE: a bare '--user-agent' used to sit here without a value, which
        # made youtube-dl consume the following '--all-subs' as the UA string.
        '--all-subs',
        '--extract-audio',
        '--keep-video',
        '--ignore-errors',
        '--geo-bypass',
        '--audio-format', 'mp3',
        '--audio-quality', '320K',
        '--embed-thumbnail',
        '--add-metadata',
        # certificate checks are only disabled when the config asks for it
        *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
        link['url'],
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
        chmod_file(output, cwd=link_dir)
        if result.returncode:
            if (b'ERROR: Unsupported URL' in result.stderr
                    or b'HTTP Error 404' in result.stderr
                    or b'HTTP Error 403' in result.stderr
                    or b'URL could be a direct video link' in result.stderr
                    or b'Unable to extract container ID' in result.stderr):
                # These happen too frequently on non-media pages to warrant printing to console
                pass
            else:
                hints = (
                    'Got youtube-dl response code: {}.'.format(result.returncode),
                    *result.stderr.decode().split('\n'),
                )
                raise ArchiveError('Failed to download media', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
        **timer.stats,
    }
2017-10-30 09:01:59 +00:00
2019-01-11 10:18:49 +00:00
2019-03-21 09:35:41 +00:00
def should_fetch_archive_dot_org(link_dir, link):
    """Return False for static-file URLs or when archive.org.txt already exists, else SUBMIT_ARCHIVE_DOT_ORG."""
    if is_static_file(link['url']):
        return False
    # the mere presence of the file counts; its content is not inspected
    marker_exists = os.path.exists(os.path.join(link_dir, 'archive.org.txt'))
    return False if marker_exists else SUBMIT_ARCHIVE_DOT_ORG
2019-03-21 05:28:12 +00:00
def archive_dot_org(link_dir, link, timeout=TIMEOUT):
    """Submit the URL to archive.org's save endpoint and store the resulting URL.

    A HEAD request is sent with curl and the response headers are parsed with
    ``parse_archive_dot_org_response``. On success the archive URL (or, when
    archive.org refused because of robots rules, the submit URL) is written
    to ``<link_dir>/archive.org.txt``.

    Returns a result dict with 'cmd', 'pwd', 'output' (the URL written, or the
    exception on failure), 'status' and the ``timer.stats`` entries.
    """
    output = 'archive.org.txt'
    archive_org_url = None
    submit_url = 'https://web.archive.org/save/{}'.format(link['url'])

    # be nice to the Archive.org people and show them where all this
    # ArchiveBox traffic is coming from
    user_agent_args = ('--user-agent', '{}'.format(CURL_USER_AGENT)) if CURL_USER_AGENT else ()
    insecure_flag = () if CHECK_SSL_VALIDITY else ('--insecure',)
    cmd = [
        CURL_BINARY,
        '--location',
        '--head',
        *user_agent_args,
        '--max-time', str(timeout),
        *insecure_flag,
        submit_url,
    ]

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        proc = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout)
        content_location, errors = parse_archive_dot_org_response(proc.stdout)
        if content_location:
            archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
        else:
            # a lone robots.txt denial is tolerated and is not an error
            robots_denied = len(errors) == 1 and 'RobotAccessControlException' in errors[0]
            if not robots_denied:
                if errors:
                    raise ArchiveError(', '.join(errors))
                raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
    except Exception as err:
        status, output = 'failed', err
    finally:
        timer.end()

    if status == 'succeeded':
        # Instead of writing None when archive.org rejects the url, write the
        # url to resubmit it to archive.org. That way, when the user visits
        # the URL in person, it will attempt to re-archive it, and it'll show
        # the nicer error message explaining why the url was rejected.
        archive_org_url = archive_org_url or submit_url
        with open(os.path.join(link_dir, output), 'w', encoding='utf-8') as f:
            f.write(archive_org_url)
        chmod_file('archive.org.txt', cwd=link_dir)
        output = archive_org_url

    return {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
        **timer.stats,
    }
2019-03-22 19:09:39 +00:00
def parse_archive_dot_org_response(response):
    """Extract the archive location and runtime errors from raw response headers.

    ``response`` is the raw header bytes. Lines that are blank or contain no
    colon are ignored; header names are compared case-insensitively.

    Returns a ``(content_location, errors)`` tuple of lists holding the values
    of every "content-location" and "x-archive-wayback-runtime-error" header
    (each list is empty when the header is absent).
    """
    collected = defaultdict(list)

    for raw_line in response.splitlines():
        if not raw_line.strip() or b':' not in raw_line:
            continue
        # split on the first colon only; values may themselves contain colons
        key, _, value = raw_line.decode().partition(':')
        collected[key.lower().strip()].append(value.strip())

    return collected['content-location'], collected['x-archive-wayback-runtime-error']