From 18f0f66f1ebaf3a71f4ab35bf88fedcb3ea57ef2 Mon Sep 17 00:00:00 2001 From: misha Date: Sun, 5 Apr 2020 18:22:59 +0300 Subject: [PATCH 1/3] 05042020 --- archivebox/archive_methods.py | 5 +++-- archivebox/config.py | 5 ++++- etc/ArchiveBox.conf.default | 1 + 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py index b2f04f33..75e7be0d 100644 --- a/archivebox/archive_methods.py +++ b/archivebox/archive_methods.py @@ -30,6 +30,7 @@ from config import ( OUTPUT_DIR, GIT_DOMAINS, GIT_SHA, + CURL_USER_AGENT, WGET_USER_AGENT, CHECK_SSL_VALIDITY, COOKIES_FILE, @@ -226,7 +227,7 @@ def fetch_wget(link_dir, link, timeout=TIMEOUT): '--span-hosts', '--no-parent', '-e', 'robots=off', - '--restrict-file-names=windows', + '--restrict-file-names=nocontrol', '--timeout={}'.format(timeout), *(('--compression=auto',) if WGET_AUTO_COMPRESSION else ()), *(() if FETCH_WARC else ('--timestamping',)), @@ -561,7 +562,7 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT): CURL_BINARY, '--location', '--head', - '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(GIT_SHA), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from + *(('--user-agent', '{}'.format(CURL_USER_AGENT),) if CURL_USER_AGENT else ()), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from '--max-time', str(timeout), *(() if CHECK_SSL_VALIDITY else ('--insecure',)), submit_url, diff --git a/archivebox/config.py b/archivebox/config.py index 47f1776f..18fe204c 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -35,6 +35,7 @@ SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True' CHECK_SSL_VALIDITY = os.getenv('CHECK_SSL_VALIDITY', 'True' ).lower() == 'true' RESOLUTION = os.getenv('RESOLUTION', '1440,2000' ) GIT_DOMAINS = os.getenv('GIT_DOMAINS', 'github.com,bitbucket.org,gitlab.com').split(',') +CURL_USER_AGENT = os.getenv('CURL_USER_AGENT', 'ArchiveBox/{GIT_SHA} (+https://github.com/pirate/ArchiveBox/)') WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', 'ArchiveBox/{GIT_SHA} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}') COOKIES_FILE = os.getenv('COOKIES_FILE', None) CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None) @@ -192,13 +193,15 @@ try: raise ### Make sure curl is installed - if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG: + if FETCH_FAVICON or FETCH_TITLE or SUBMIT_ARCHIVE_DOT_ORG: if run(['which', CURL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode: print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI)) print(' Install it, then confirm it works with: {} --version'.format(CURL_BINARY)) print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') raise SystemExit(1) + CURL_USER_AGENT = CURL_USER_AGENT.format(GIT_SHA=GIT_SHA[:9]) + ### Make sure wget is installed and calculate version if FETCH_WGET or FETCH_WARC: if run(['which', WGET_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode: diff --git a/etc/ArchiveBox.conf.default b/etc/ArchiveBox.conf.default index dcb8aeac..9ceeff17 100644 --- a/etc/ArchiveBox.conf.default +++ b/etc/ArchiveBox.conf.default @@ -40,6 +40,7 @@ #CHECK_SSL_VALIDITY=True #FETCH_WGET_REQUISITES=True #RESOLUTION="1440,900" +#CURL_USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36" #WGET_USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36" #CHROME_USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36" #GIT_DOMAINS="github.com,bitbucket.org,gitlab.com" From bb580533f715a1b40f8534f81ba99591b3f24821 Mon Sep 17 00:00:00 2001 From: comsomisha Date: Sun, 5 Apr 2020 18:35:13 +0300 Subject: [PATCH 2/3] 0504202002 --- archivebox/archive_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py index 75e7be0d..1ff03027 100644 --- a/archivebox/archive_methods.py +++ b/archivebox/archive_methods.py @@ -562,7 +562,7 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT): CURL_BINARY, '--location', '--head', - *(('--user-agent', '{}'.format(CURL_USER_AGENT),) if CURL_USER_AGENT else ()), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from + *(('--user-agent', '{}'.format(CURL_USER_AGENT),) if CURL_USER_AGENT else ()), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from '--max-time', str(timeout), *(() if CHECK_SSL_VALIDITY else ('--insecure',)), submit_url, From 1aa2a5b0697e09d20f674571c7f1695ee4c354b2 Mon Sep 17 00:00:00 2001 From: misha Date: Wed, 15 Apr 2020 11:54:53 +0300 Subject: [PATCH 3/3] 15042020 --- archivebox/archive_methods.py | 3 ++- archivebox/config.py | 1 + etc/ArchiveBox.conf.default | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py index 1ff03027..6fd08d0e 100644 --- a/archivebox/archive_methods.py +++ b/archivebox/archive_methods.py @@ -30,6 +30,7 @@ from config import ( OUTPUT_DIR, GIT_DOMAINS, GIT_SHA, + RESTRICT_FILE_NAMES, CURL_USER_AGENT, WGET_USER_AGENT, CHECK_SSL_VALIDITY, @@ -227,7 +228,7 @@ def fetch_wget(link_dir, link, timeout=TIMEOUT): '--span-hosts', '--no-parent', '-e', 'robots=off', - '--restrict-file-names=nocontrol', + *(('--restrict-file-names={}'.format(RESTRICT_FILE_NAMES),) if RESTRICT_FILE_NAMES else ()), '--timeout={}'.format(timeout), *(('--compression=auto',) if WGET_AUTO_COMPRESSION else ()), *(() if FETCH_WARC else ('--timestamping',)), diff --git a/archivebox/config.py b/archivebox/config.py index 18fe204c..f4907a30 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -34,6 +34,7 @@ SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True' CHECK_SSL_VALIDITY = os.getenv('CHECK_SSL_VALIDITY', 'True' ).lower() == 'true' RESOLUTION = os.getenv('RESOLUTION', '1440,2000' ) +RESTRICT_FILE_NAMES = os.getenv('RESTRICT_FILE_NAMES', 'windows' ) GIT_DOMAINS = os.getenv('GIT_DOMAINS', 'github.com,bitbucket.org,gitlab.com').split(',') CURL_USER_AGENT = os.getenv('CURL_USER_AGENT', 'ArchiveBox/{GIT_SHA} (+https://github.com/pirate/ArchiveBox/)') WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', 'ArchiveBox/{GIT_SHA} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}') diff --git a/etc/ArchiveBox.conf.default b/etc/ArchiveBox.conf.default index 9ceeff17..a48ee8e2 100644 --- a/etc/ArchiveBox.conf.default +++ b/etc/ArchiveBox.conf.default @@ -39,6 +39,7 @@ #CHECK_SSL_VALIDITY=True #FETCH_WGET_REQUISITES=True +#RESTRICT_FILE_NAMES="windows" #RESOLUTION="1440,900" #CURL_USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36" #WGET_USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36"