mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-22 12:13:05 +00:00
move wget and mercury into plugins
This commit is contained in:
parent
dce79d63c6
commit
69522da4bb
6 changed files with 341 additions and 107 deletions
|
@ -59,6 +59,7 @@ from ..misc.logging import (
|
||||||
from .defaults import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG
|
from .defaults import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG
|
||||||
from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
|
from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
|
||||||
from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
|
from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
|
||||||
|
from archivebox.plugins_extractor.wget.apps import WGET_CONFIG
|
||||||
|
|
||||||
ANSI = SHELL_CONFIG.ANSI
|
ANSI = SHELL_CONFIG.ANSI
|
||||||
LDAP = LDAP_CONFIG.LDAP_ENABLED
|
LDAP = LDAP_CONFIG.LDAP_ENABLED
|
||||||
|
@ -81,6 +82,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
||||||
'LDAP_CONFIG': LDAP_CONFIG.as_legacy_config_schema(),
|
'LDAP_CONFIG': LDAP_CONFIG.as_legacy_config_schema(),
|
||||||
|
|
||||||
'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),
|
'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),
|
||||||
|
|
||||||
|
'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(),
|
||||||
|
|
||||||
|
|
||||||
'ARCHIVE_METHOD_TOGGLES': {
|
'ARCHIVE_METHOD_TOGGLES': {
|
||||||
|
@ -112,7 +115,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
||||||
|
|
||||||
'USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
|
'USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
|
||||||
'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']}, # + ' curl/{CURL_VERSION}'},
|
'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']}, # + ' curl/{CURL_VERSION}'},
|
||||||
'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']}, # + ' wget/{WGET_VERSION}'},
|
|
||||||
|
|
||||||
'COOKIES_FILE': {'type': str, 'default': None},
|
'COOKIES_FILE': {'type': str, 'default': None},
|
||||||
|
|
||||||
|
@ -143,16 +145,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
||||||
'YOUTUBEDL_EXTRA_ARGS': {'type': list, 'default': None},
|
'YOUTUBEDL_EXTRA_ARGS': {'type': list, 'default': None},
|
||||||
|
|
||||||
|
|
||||||
'WGET_ARGS': {'type': list, 'default': ['--no-verbose',
|
|
||||||
'--adjust-extension',
|
|
||||||
'--convert-links',
|
|
||||||
'--force-directories',
|
|
||||||
'--backup-converted',
|
|
||||||
'--span-hosts',
|
|
||||||
'--no-parent',
|
|
||||||
'-e', 'robots=off',
|
|
||||||
]},
|
|
||||||
'WGET_EXTRA_ARGS': {'type': list, 'default': None},
|
|
||||||
'CURL_ARGS': {'type': list, 'default': ['--silent',
|
'CURL_ARGS': {'type': list, 'default': ['--silent',
|
||||||
'--location',
|
'--location',
|
||||||
'--compressed'
|
'--compressed'
|
||||||
|
@ -161,16 +153,12 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
||||||
'GIT_ARGS': {'type': list, 'default': ['--recursive']},
|
'GIT_ARGS': {'type': list, 'default': ['--recursive']},
|
||||||
'SINGLEFILE_ARGS': {'type': list, 'default': None},
|
'SINGLEFILE_ARGS': {'type': list, 'default': None},
|
||||||
'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None},
|
'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None},
|
||||||
'MERCURY_ARGS': {'type': list, 'default': ['--format=text']},
|
|
||||||
'MERCURY_EXTRA_ARGS': {'type': list, 'default': None},
|
|
||||||
},
|
},
|
||||||
|
|
||||||
'DEPENDENCY_CONFIG': {
|
'DEPENDENCY_CONFIG': {
|
||||||
'USE_CURL': {'type': bool, 'default': True},
|
'USE_CURL': {'type': bool, 'default': True},
|
||||||
'USE_WGET': {'type': bool, 'default': True},
|
|
||||||
'USE_SINGLEFILE': {'type': bool, 'default': True},
|
'USE_SINGLEFILE': {'type': bool, 'default': True},
|
||||||
'USE_READABILITY': {'type': bool, 'default': True},
|
'USE_READABILITY': {'type': bool, 'default': True},
|
||||||
'USE_MERCURY': {'type': bool, 'default': True},
|
|
||||||
'USE_GIT': {'type': bool, 'default': True},
|
'USE_GIT': {'type': bool, 'default': True},
|
||||||
'USE_CHROME': {'type': bool, 'default': True},
|
'USE_CHROME': {'type': bool, 'default': True},
|
||||||
'USE_YOUTUBEDL': {'type': bool, 'default': True},
|
'USE_YOUTUBEDL': {'type': bool, 'default': True},
|
||||||
|
@ -178,8 +166,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
||||||
|
|
||||||
'CURL_BINARY': {'type': str, 'default': 'curl'},
|
'CURL_BINARY': {'type': str, 'default': 'curl'},
|
||||||
'GIT_BINARY': {'type': str, 'default': 'git'},
|
'GIT_BINARY': {'type': str, 'default': 'git'},
|
||||||
'WGET_BINARY': {'type': str, 'default': 'wget'}, # also can accept wget2
|
|
||||||
'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('postlight-parser')},
|
|
||||||
'NODE_BINARY': {'type': str, 'default': 'node'},
|
'NODE_BINARY': {'type': str, 'default': 'node'},
|
||||||
# 'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, # also can accept youtube-dl
|
# 'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, # also can accept youtube-dl
|
||||||
# 'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
|
# 'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
|
||||||
|
@ -232,21 +218,6 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
|
||||||
'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
|
'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
|
||||||
'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
|
'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
|
||||||
|
|
||||||
'USE_WGET': {'default': lambda c: c['USE_WGET'] and (c['SAVE_WGET'] or c['SAVE_WARC'])},
|
|
||||||
'WGET_VERSION': {'default': lambda c: bin_version(c['WGET_BINARY']) if c['USE_WGET'] else None},
|
|
||||||
'WGET_AUTO_COMPRESSION': {'default': lambda c: wget_supports_compression(c) if c['USE_WGET'] else False},
|
|
||||||
# 'WGET_USER_AGENT': {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
|
|
||||||
'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
|
|
||||||
'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
|
|
||||||
'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
|
|
||||||
'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},
|
|
||||||
|
|
||||||
'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
|
|
||||||
'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY']},
|
|
||||||
'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
|
|
||||||
'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []},
|
|
||||||
'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []},
|
|
||||||
|
|
||||||
'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
|
'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
|
||||||
'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
|
'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
|
||||||
'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
|
'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
|
||||||
|
@ -649,13 +620,13 @@ def get_dependency_info(config: benedict) -> ConfigValue:
|
||||||
'enabled': config['USE_CURL'],
|
'enabled': config['USE_CURL'],
|
||||||
'is_valid': bool(config['CURL_VERSION']),
|
'is_valid': bool(config['CURL_VERSION']),
|
||||||
},
|
},
|
||||||
'WGET_BINARY': {
|
# 'WGET_BINARY': {
|
||||||
'path': bin_path(config['WGET_BINARY']),
|
# 'path': bin_path(config['WGET_BINARY']),
|
||||||
'version': config['WGET_VERSION'],
|
# 'version': config['WGET_VERSION'],
|
||||||
'hash': bin_hash(config['WGET_BINARY']),
|
# 'hash': bin_hash(config['WGET_BINARY']),
|
||||||
'enabled': config['USE_WGET'],
|
# 'enabled': config['USE_WGET'],
|
||||||
'is_valid': bool(config['WGET_VERSION']),
|
# 'is_valid': bool(config['WGET_VERSION']),
|
||||||
},
|
# },
|
||||||
# 'NODE_BINARY': {
|
# 'NODE_BINARY': {
|
||||||
# 'path': bin_path(config['NODE_BINARY']),
|
# 'path': bin_path(config['NODE_BINARY']),
|
||||||
# 'version': config['NODE_VERSION'],
|
# 'version': config['NODE_VERSION'],
|
||||||
|
@ -663,13 +634,13 @@ def get_dependency_info(config: benedict) -> ConfigValue:
|
||||||
# 'enabled': config['USE_NODE'],
|
# 'enabled': config['USE_NODE'],
|
||||||
# 'is_valid': bool(config['NODE_VERSION']),
|
# 'is_valid': bool(config['NODE_VERSION']),
|
||||||
# },
|
# },
|
||||||
'MERCURY_BINARY': {
|
# 'MERCURY_BINARY': {
|
||||||
'path': bin_path(config['MERCURY_BINARY']),
|
# 'path': bin_path(config['MERCURY_BINARY']),
|
||||||
'version': config['MERCURY_VERSION'],
|
# 'version': config['MERCURY_VERSION'],
|
||||||
'hash': bin_hash(config['MERCURY_BINARY']),
|
# 'hash': bin_hash(config['MERCURY_BINARY']),
|
||||||
'enabled': config['USE_MERCURY'],
|
# 'enabled': config['USE_MERCURY'],
|
||||||
'is_valid': bool(config['MERCURY_VERSION']),
|
# 'is_valid': bool(config['MERCURY_VERSION']),
|
||||||
},
|
# },
|
||||||
'GIT_BINARY': {
|
'GIT_BINARY': {
|
||||||
'path': bin_path(config['GIT_BINARY']),
|
'path': bin_path(config['GIT_BINARY']),
|
||||||
'version': config['GIT_VERSION'],
|
'version': config['GIT_VERSION'],
|
||||||
|
|
|
@ -11,16 +11,9 @@ from archivebox.misc.system import run, atomic_write
|
||||||
from archivebox.misc.util import (
|
from archivebox.misc.util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
dedupe,
|
|
||||||
)
|
|
||||||
from ..config.legacy import (
|
|
||||||
TIMEOUT,
|
|
||||||
SAVE_MERCURY,
|
|
||||||
DEPENDENCIES,
|
|
||||||
MERCURY_VERSION,
|
|
||||||
MERCURY_ARGS,
|
|
||||||
MERCURY_EXTRA_ARGS,
|
|
||||||
)
|
)
|
||||||
|
from archivebox.plugins_extractor.mercury.apps import MERCURY_CONFIG, MERCURY_BINARY
|
||||||
|
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
|
|
||||||
|
|
||||||
|
@ -49,35 +42,36 @@ def should_save_mercury(link: Link, out_dir: Optional[str]=None, overwrite: Opti
|
||||||
if is_static_file(link.url):
|
if is_static_file(link.url):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
out_dir = out_dir or Path(link.link_dir)
|
out_dir = Path(out_dir or link.link_dir)
|
||||||
|
|
||||||
if not overwrite and (out_dir / get_output_path()).exists():
|
if not overwrite and (out_dir / get_output_path()).exists():
|
||||||
return False
|
return False
|
||||||
|
|
||||||
return SAVE_MERCURY
|
return MERCURY_CONFIG.SAVE_MERCURY
|
||||||
|
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=MERCURY_CONFIG.MERCURY_TIMEOUT) -> ArchiveResult:
|
||||||
"""download reader friendly version using @postlight/mercury-parser"""
|
"""download reader friendly version using @postlight/mercury-parser"""
|
||||||
|
|
||||||
out_dir = Path(out_dir or link.link_dir)
|
out_dir = Path(out_dir or link.link_dir)
|
||||||
output_folder = out_dir.absolute() / get_output_path()
|
output_folder = out_dir.absolute() / get_output_path()
|
||||||
output = get_output_path()
|
output = get_output_path()
|
||||||
|
|
||||||
|
mercury_binary = MERCURY_BINARY.load()
|
||||||
|
assert mercury_binary.abspath and mercury_binary.version
|
||||||
|
|
||||||
status = 'succeeded'
|
status = 'succeeded'
|
||||||
timer = TimedProgress(timeout, prefix=' ')
|
timer = TimedProgress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
output_folder.mkdir(exist_ok=True)
|
output_folder.mkdir(exist_ok=True)
|
||||||
# later options take precedence
|
# later options take precedence
|
||||||
options = [
|
|
||||||
*MERCURY_ARGS,
|
|
||||||
*MERCURY_EXTRA_ARGS,
|
|
||||||
]
|
|
||||||
# By default, get plain text version of article
|
# By default, get plain text version of article
|
||||||
cmd = [
|
cmd = [
|
||||||
DEPENDENCIES['MERCURY_BINARY']['path'],
|
str(mercury_binary.abspath),
|
||||||
|
*MERCURY_CONFIG.MERCURY_EXTRA_ARGS,
|
||||||
|
'--format=text',
|
||||||
link.url,
|
link.url,
|
||||||
*dedupe(options)
|
|
||||||
]
|
]
|
||||||
result = run(cmd, cwd=out_dir, timeout=timeout)
|
result = run(cmd, cwd=out_dir, timeout=timeout)
|
||||||
try:
|
try:
|
||||||
|
@ -92,7 +86,8 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
|
||||||
|
|
||||||
# Get HTML version of article
|
# Get HTML version of article
|
||||||
cmd = [
|
cmd = [
|
||||||
DEPENDENCIES['MERCURY_BINARY']['path'],
|
str(mercury_binary.abspath),
|
||||||
|
*MERCURY_CONFIG.MERCURY_EXTRA_ARGS,
|
||||||
link.url
|
link.url
|
||||||
]
|
]
|
||||||
result = run(cmd, cwd=out_dir, timeout=timeout)
|
result = run(cmd, cwd=out_dir, timeout=timeout)
|
||||||
|
@ -119,7 +114,7 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
|
||||||
return ArchiveResult(
|
return ArchiveResult(
|
||||||
cmd=cmd,
|
cmd=cmd,
|
||||||
pwd=str(out_dir),
|
pwd=str(out_dir),
|
||||||
cmd_version=MERCURY_VERSION,
|
cmd_version=str(mercury_binary.version),
|
||||||
output=output,
|
output=output,
|
||||||
status=status,
|
status=status,
|
||||||
**timer.stats,
|
**timer.stats,
|
||||||
|
|
|
@ -6,7 +6,6 @@ from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
|
||||||
from archivebox.misc.system import run, chmod_file
|
from archivebox.misc.system import run, chmod_file
|
||||||
from archivebox.misc.util import (
|
from archivebox.misc.util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
|
@ -17,22 +16,10 @@ from archivebox.misc.util import (
|
||||||
urldecode,
|
urldecode,
|
||||||
dedupe,
|
dedupe,
|
||||||
)
|
)
|
||||||
from ..config.legacy import (
|
from archivebox.plugins_extractor.wget.apps import WGET_BINARY, WGET_CONFIG
|
||||||
WGET_ARGS,
|
|
||||||
WGET_EXTRA_ARGS,
|
|
||||||
TIMEOUT,
|
|
||||||
SAVE_WGET,
|
|
||||||
SAVE_WARC,
|
|
||||||
WGET_BINARY,
|
|
||||||
WGET_VERSION,
|
|
||||||
RESTRICT_FILE_NAMES,
|
|
||||||
CHECK_SSL_VALIDITY,
|
|
||||||
SAVE_WGET_REQUISITES,
|
|
||||||
WGET_AUTO_COMPRESSION,
|
|
||||||
WGET_USER_AGENT,
|
|
||||||
COOKIES_FILE,
|
|
||||||
)
|
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||||
|
|
||||||
|
|
||||||
def get_output_path():
|
def get_output_path():
|
||||||
|
@ -54,38 +41,43 @@ def should_save_wget(link: Link, out_dir: Optional[Path]=None, overwrite: Option
|
||||||
if not overwrite and output_path and (out_dir / output_path).exists():
|
if not overwrite and output_path and (out_dir / output_path).exists():
|
||||||
return False
|
return False
|
||||||
|
|
||||||
return SAVE_WGET
|
return WGET_CONFIG.SAVE_WGET
|
||||||
|
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=WGET_CONFIG.WGET_TIMEOUT) -> ArchiveResult:
|
||||||
"""download full site using wget"""
|
"""download full site using wget"""
|
||||||
|
|
||||||
out_dir = out_dir or link.link_dir
|
out_dir = Path(out_dir or link.link_dir)
|
||||||
if SAVE_WARC:
|
assert out_dir.exists()
|
||||||
|
|
||||||
|
if WGET_CONFIG.SAVE_WARC:
|
||||||
warc_dir = out_dir / "warc"
|
warc_dir = out_dir / "warc"
|
||||||
warc_dir.mkdir(exist_ok=True)
|
warc_dir.mkdir(exist_ok=True)
|
||||||
warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp()))
|
warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp()))
|
||||||
|
|
||||||
|
wget_binary = WGET_BINARY.load()
|
||||||
|
assert wget_binary.abspath and wget_binary.version
|
||||||
|
|
||||||
# WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
|
# WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
|
||||||
output: ArchiveOutput = None
|
output: ArchiveOutput = None
|
||||||
# later options take precedence
|
# later options take precedence
|
||||||
options = [
|
options = [
|
||||||
*WGET_ARGS,
|
*WGET_CONFIG.WGET_ARGS,
|
||||||
*WGET_EXTRA_ARGS,
|
*WGET_CONFIG.WGET_EXTRA_ARGS,
|
||||||
'--timeout={}'.format(timeout),
|
'--timeout={}'.format(timeout),
|
||||||
*(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
|
*(['--restrict-file-names={}'.format(WGET_CONFIG.WGET_RESTRICT_FILE_NAMES)] if WGET_CONFIG.WGET_RESTRICT_FILE_NAMES else []),
|
||||||
*(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
|
*(['--warc-file={}'.format(str(warc_path))] if WGET_CONFIG.SAVE_WARC else []),
|
||||||
*(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
|
*(['--page-requisites'] if WGET_CONFIG.SAVE_WGET_REQUISITES else []),
|
||||||
*(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
|
*(['--user-agent={}'.format(WGET_CONFIG.WGET_USER_AGENT)] if WGET_CONFIG.WGET_USER_AGENT else []),
|
||||||
*(['--load-cookies', str(COOKIES_FILE)] if COOKIES_FILE else []),
|
*(['--load-cookies', str(WGET_CONFIG.WGET_COOKIES_FILE)] if WGET_CONFIG.WGET_COOKIES_FILE else []),
|
||||||
*(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
|
*(['--compression=auto'] if WGET_CONFIG.WGET_AUTO_COMPRESSION else []),
|
||||||
*([] if SAVE_WARC else ['--timestamping']),
|
*([] if WGET_CONFIG.SAVE_WARC else ['--timestamping']),
|
||||||
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
|
*([] if WGET_CONFIG.WGET_CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
|
||||||
# '--server-response', # print headers for better error parsing
|
# '--server-response', # print headers for better error parsing
|
||||||
]
|
]
|
||||||
cmd = [
|
cmd = [
|
||||||
WGET_BINARY,
|
str(wget_binary.abspath),
|
||||||
*dedupe(options),
|
*dedupe(options),
|
||||||
link.url,
|
link.url,
|
||||||
]
|
]
|
||||||
|
@ -137,7 +129,7 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
|
||||||
return ArchiveResult(
|
return ArchiveResult(
|
||||||
cmd=cmd,
|
cmd=cmd,
|
||||||
pwd=str(out_dir),
|
pwd=str(out_dir),
|
||||||
cmd_version=WGET_VERSION,
|
cmd_version=str(wget_binary.version),
|
||||||
output=output,
|
output=output,
|
||||||
status=status,
|
status=status,
|
||||||
**timer.stats,
|
**timer.stats,
|
||||||
|
|
82
archivebox/plugins_extractor/mercury/apps.py
Normal file
82
archivebox/plugins_extractor/mercury/apps.py
Normal file
|
@ -0,0 +1,82 @@
|
||||||
|
__package__ = 'plugins_extractor.mercury'
|
||||||
|
|
||||||
|
from typing import List, Optional, Dict
|
||||||
|
from pathlib import Path
|
||||||
|
from subprocess import run
|
||||||
|
|
||||||
|
from pydantic import InstanceOf, Field
|
||||||
|
from pydantic_pkgr import BinProvider, BinName, bin_abspath
|
||||||
|
|
||||||
|
from abx.archivebox.base_plugin import BasePlugin, BaseHook
|
||||||
|
from abx.archivebox.base_configset import BaseConfigSet
|
||||||
|
from abx.archivebox.base_binary import BaseBinary, BinProviderName,ProviderLookupDict, env
|
||||||
|
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
|
||||||
|
|
||||||
|
from archivebox.config import ARCHIVING_CONFIG, STORAGE_CONFIG
|
||||||
|
from archivebox.plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
|
||||||
|
|
||||||
|
class MercuryConfig(BaseConfigSet):
|
||||||
|
|
||||||
|
SAVE_MERCURY: bool = Field(default=True, alias='USE_MERCURY')
|
||||||
|
|
||||||
|
MERCURY_BINARY: str = Field(default='postlight-parser')
|
||||||
|
MERCURY_EXTRA_ARGS: List[str] = []
|
||||||
|
|
||||||
|
SAVE_MERCURY_REQUISITES: bool = Field(default=True)
|
||||||
|
MERCURY_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
|
||||||
|
|
||||||
|
MERCURY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
|
||||||
|
MERCURY_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
|
||||||
|
MERCURY_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
|
||||||
|
MERCURY_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
MERCURY_CONFIG = MercuryConfig()
|
||||||
|
|
||||||
|
|
||||||
|
class MercuryBinary(BaseBinary):
|
||||||
|
name: BinName = MERCURY_CONFIG.MERCURY_BINARY
|
||||||
|
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
|
||||||
|
|
||||||
|
provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
|
||||||
|
LIB_NPM_BINPROVIDER.name: {
|
||||||
|
'packages': lambda: ['@postlight/parser@^2.2.3'],
|
||||||
|
'version': lambda: run([str(LIB_NPM_BINPROVIDER.INSTALLER_BIN_ABSPATH), f'--prefix={LIB_NPM_BINPROVIDER.npm_prefix}', 'info', '@postlight/parser', 'version'], text=True, capture_output=True).stdout.strip(),
|
||||||
|
},
|
||||||
|
SYS_NPM_BINPROVIDER.name: {
|
||||||
|
'packages': lambda: [], # never try to install things globally
|
||||||
|
'version': lambda: run([str(SYS_NPM_BINPROVIDER.INSTALLER_BIN_ABSPATH), '-g', 'info', '@postlight/parser', 'version'], text=True, capture_output=True).stdout.strip(),
|
||||||
|
},
|
||||||
|
env.name: {
|
||||||
|
'version': lambda: '999.999.999' if bin_abspath('postlight-parser', PATH=env.PATH) else None,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
MERCURY_BINARY = MercuryBinary()
|
||||||
|
|
||||||
|
|
||||||
|
class MercuryExtractor(BaseExtractor):
|
||||||
|
name: ExtractorName = 'mercury'
|
||||||
|
binary: str = MERCURY_BINARY.name
|
||||||
|
|
||||||
|
def get_output_path(self, snapshot) -> Path | None:
|
||||||
|
return snapshot.link_dir / 'mercury' / 'content.html'
|
||||||
|
|
||||||
|
MERCURY_EXTRACTOR = MercuryExtractor()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class MercuryPlugin(BasePlugin):
|
||||||
|
app_label: str = 'mercury'
|
||||||
|
verbose_name: str = 'MERCURY'
|
||||||
|
|
||||||
|
hooks: List[InstanceOf[BaseHook]] = [
|
||||||
|
MERCURY_CONFIG,
|
||||||
|
MERCURY_BINARY,
|
||||||
|
MERCURY_EXTRACTOR,
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
PLUGIN = MercuryPlugin()
|
||||||
|
DJANGO_APP = PLUGIN.AppConfig
|
|
@ -1,17 +1,21 @@
|
||||||
|
__package__ = 'plugins_extractor.wget'
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
from typing import List, Optional
|
from typing import List, Optional, Dict
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from subprocess import run, DEVNULL
|
||||||
|
|
||||||
from rich import print
|
from rich import print
|
||||||
from pydantic import InstanceOf, Field, model_validator
|
from pydantic import InstanceOf, Field, model_validator
|
||||||
from pydantic_pkgr import BinProvider, BinName
|
from pydantic_pkgr import BinProvider, BinName, bin_abspath, BinProviderName, ProviderLookupDict
|
||||||
|
|
||||||
from abx.archivebox.base_plugin import BasePlugin, BaseHook
|
from abx.archivebox.base_plugin import BasePlugin, BaseHook
|
||||||
from abx.archivebox.base_configset import BaseConfigSet
|
from abx.archivebox.base_configset import BaseConfigSet
|
||||||
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
||||||
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
|
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
|
||||||
|
|
||||||
from archivebox.extractors.wget import wget_output_path
|
from archivebox.config import ARCHIVING_CONFIG, STORAGE_CONFIG
|
||||||
|
from .wget_util import wget_output_path
|
||||||
|
|
||||||
|
|
||||||
class WgetConfig(BaseConfigSet):
|
class WgetConfig(BaseConfigSet):
|
||||||
|
@ -34,13 +38,13 @@ class WgetConfig(BaseConfigSet):
|
||||||
]
|
]
|
||||||
WGET_EXTRA_ARGS: List[str] = []
|
WGET_EXTRA_ARGS: List[str] = []
|
||||||
|
|
||||||
WGET_AUTO_COMPRESSION: bool = Field(default=True)
|
|
||||||
SAVE_WGET_REQUISITES: bool = Field(default=True)
|
SAVE_WGET_REQUISITES: bool = Field(default=True)
|
||||||
WGET_USER_AGENT: str = Field(default='', alias='USER_AGENT')
|
WGET_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
|
||||||
WGET_TIMEOUT: int = Field(default=60, alias='TIMEOUT')
|
|
||||||
WGET_CHECK_SSL_VALIDITY: bool = Field(default=True, alias='CHECK_SSL_VALIDITY')
|
WGET_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
|
||||||
WGET_RESTRICT_FILE_NAMES: str = Field(default='windows', alias='RESTRICT_FILE_NAMES')
|
WGET_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
|
||||||
WGET_COOKIES_FILE: Optional[Path] = Field(default=None, alias='COOKIES_FILE')
|
WGET_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
|
||||||
|
WGET_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
|
||||||
|
|
||||||
@model_validator(mode='after')
|
@model_validator(mode='after')
|
||||||
def validate_use_ytdlp(self):
|
def validate_use_ytdlp(self):
|
||||||
|
@ -53,6 +57,22 @@ class WgetConfig(BaseConfigSet):
|
||||||
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media', file=sys.stderr)
|
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media', file=sys.stderr)
|
||||||
print(file=sys.stderr)
|
print(file=sys.stderr)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
@property
|
||||||
|
def WGET_AUTO_COMPRESSION(self) -> bool:
|
||||||
|
if hasattr(self, '_WGET_AUTO_COMPRESSION'):
|
||||||
|
return self._WGET_AUTO_COMPRESSION
|
||||||
|
try:
|
||||||
|
cmd = [
|
||||||
|
self.WGET_BINARY,
|
||||||
|
"--compression=auto",
|
||||||
|
"--help",
|
||||||
|
]
|
||||||
|
self._WGET_AUTO_COMPRESSION = not run(cmd, stdout=DEVNULL, stderr=DEVNULL, timeout=3).returncode
|
||||||
|
return self._WGET_AUTO_COMPRESSION
|
||||||
|
except (FileNotFoundError, OSError):
|
||||||
|
self._WGET_AUTO_COMPRESSION = False
|
||||||
|
return False
|
||||||
|
|
||||||
WGET_CONFIG = WgetConfig()
|
WGET_CONFIG = WgetConfig()
|
||||||
|
|
||||||
|
@ -60,6 +80,12 @@ WGET_CONFIG = WgetConfig()
|
||||||
class WgetBinary(BaseBinary):
|
class WgetBinary(BaseBinary):
|
||||||
name: BinName = WGET_CONFIG.WGET_BINARY
|
name: BinName = WGET_CONFIG.WGET_BINARY
|
||||||
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
|
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
|
||||||
|
|
||||||
|
provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
|
||||||
|
brew.name: {
|
||||||
|
'abspath': lambda: bin_abspath(WGET_CONFIG.WGET_BINARY, PATH=f'/opt/homebrew/opt/wget/bin:{brew.PATH}'),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
WGET_BINARY = WgetBinary()
|
WGET_BINARY = WgetBinary()
|
||||||
|
|
||||||
|
|
168
archivebox/plugins_extractor/wget/wget_util.py
Normal file
168
archivebox/plugins_extractor/wget/wget_util.py
Normal file
|
@ -0,0 +1,168 @@
|
||||||
|
__package__ = 'archivebox.extractors'
|
||||||
|
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
from archivebox.misc.util import (
|
||||||
|
enforce_types,
|
||||||
|
without_fragment,
|
||||||
|
without_query,
|
||||||
|
path,
|
||||||
|
domain,
|
||||||
|
urldecode,
|
||||||
|
)
|
||||||
|
|
||||||
|
@enforce_types
|
||||||
|
def unsafe_wget_output_path(link) -> Optional[str]:
|
||||||
|
# There used to be a bunch of complex reverse-engineering path mapping logic here,
|
||||||
|
# but it was removed in favor of just walking through the output folder recursively to try to find the
|
||||||
|
# html file that wget produced. It's *much much much* slower than deriving it statically, and is currently
|
||||||
|
# one of the main bottlenecks of ArchiveBox's performance (the output data is often on a slow HDD or network mount).
|
||||||
|
# But it's STILL better than trying to figure out URL -> html filepath mappings ourselves from first principles.
|
||||||
|
full_path = without_fragment(without_query(path(link.url))).strip('/')
|
||||||
|
search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)
|
||||||
|
for _ in range(4):
|
||||||
|
try:
|
||||||
|
if search_dir.exists():
|
||||||
|
if search_dir.is_dir():
|
||||||
|
html_files = [
|
||||||
|
f for f in search_dir.iterdir()
|
||||||
|
if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
|
||||||
|
]
|
||||||
|
if html_files:
|
||||||
|
return str(html_files[0].relative_to(link.link_dir))
|
||||||
|
|
||||||
|
# sometimes wget'd URLs have no ext and return non-html
|
||||||
|
# e.g. /some/example/rss/all -> some RSS XML content)
|
||||||
|
# /some/other/url.o4g -> some binary unrecognized ext)
|
||||||
|
# test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
|
||||||
|
last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
|
||||||
|
for file_present in search_dir.iterdir():
|
||||||
|
if file_present == last_part_of_url:
|
||||||
|
return str((search_dir / file_present).relative_to(link.link_dir))
|
||||||
|
except OSError:
|
||||||
|
# OSError 36 and others can happen here, caused by trying to check for impossible paths
|
||||||
|
# (paths derived from URLs can often contain illegal unicode characters or be too long,
|
||||||
|
# causing the OS / filesystem to reject trying to open them with a system-level error)
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Move up one directory level
|
||||||
|
search_dir = search_dir.parent
|
||||||
|
|
||||||
|
if str(search_dir) == link.link_dir:
|
||||||
|
break
|
||||||
|
|
||||||
|
# check for literally any file present that isnt an empty folder
|
||||||
|
domain_dir = Path(domain(link.url).replace(":", "+"))
|
||||||
|
files_within = [path for path in (Path(link.link_dir) / domain_dir).glob('**/*.*') if not str(path).endswith('.orig')]
|
||||||
|
if files_within:
|
||||||
|
return str((domain_dir / files_within[-1]).relative_to(link.link_dir))
|
||||||
|
|
||||||
|
# abandon all hope, wget either never downloaded, or it produced an output path so horribly mutilated
|
||||||
|
# that it's better we just pretend it doesnt exist
|
||||||
|
# this is why ArchiveBox's specializes in REDUNDANTLY saving copies of sites with multiple different tools
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
@enforce_types
def wget_output_path(link, nocache: bool=False) -> Optional[str]:
    """calculate the path to the wgetted .html file, since wget may
    adjust some paths to be different than the base_url path.

    See docs on: wget --adjust-extension (-E), --restrict-file-names=windows|unix|ascii, --convert-links

    WARNING: this function is extremely error prone because mapping URLs to filesystem paths deterministically
    is basically impossible. Every OS and filesystem have different requirements on what special characters are
    allowed, and URLs are *full* of all kinds of special characters, illegal unicode, and generally unsafe strings
    that you dont want anywhere near your filesystem. Also URLs can be obscenely long, but most filesystems dont
    accept paths longer than 250 characters. On top of all that, this function only exists to try to reverse engineer
    wget's approach to solving this problem, so this is a shittier, less tested version of their already insanely
    complicated attempt to do this. Here be dragons:
    - https://github.com/ArchiveBox/ArchiveBox/issues/549
    - https://github.com/ArchiveBox/ArchiveBox/issues/1373
    - https://stackoverflow.com/questions/9532499/check-whether-a-path-is-valid-in-python-without-creating-a-file-at-the-paths-ta
    - and probably many more that I didn't realize were caused by this...

    The only constructive thing we could possibly do to this function is to figure out how to remove it.

    Preach loudly to anyone who will listen: never attempt to map URLs to filesystem paths,
    and pray you never have to deal with the aftermath of someone else's attempt to do so...
    """
    # Wget downloads can save in a number of different ways depending on the url:
    #    https://example.com
    #       > example.com/index.html
    #    https://example.com?v=zzVa_tX1OiI
    #       > example.com/index.html@v=zzVa_tX1OiI.html
    #    https://www.example.com/?v=zzVa_tX1OiI
    #       > example.com/index.html@v=zzVa_tX1OiI.html

    #    https://example.com/abc
    #       > example.com/abc.html
    #    https://example.com/abc/
    #       > example.com/abc/index.html
    #    https://example.com/abc?v=zzVa_tX1OiI.html
    #       > example.com/abc@v=zzVa_tX1OiI.html
    #    https://example.com/abc/?v=zzVa_tX1OiI.html
    #       > example.com/abc/index.html@v=zzVa_tX1OiI.html

    #    https://example.com/abc/test.html
    #       > example.com/abc/test.html
    #    https://example.com/abc/test?v=zzVa_tX1OiI
    #       > example.com/abc/test@v=zzVa_tX1OiI.html
    #    https://example.com/abc/test/?v=zzVa_tX1OiI
    #       > example.com/abc/test/index.html@v=zzVa_tX1OiI.html

    # cache key includes downloaded_at so a re-download invalidates any previously cached path
    cache_key = f'{link.url_hash}:{link.timestamp}-{link.downloaded_at and link.downloaded_at.timestamp()}-wget-output-path'

    if not nocache:
        # deferred import: django may not be configured yet at module import time
        from django.core.cache import cache
        cached_result = cache.get(cache_key)
        if cached_result:
            return cached_result

    # There's also lots of complexity around how the urlencoding and renaming
    # is done for pages with query and hash fragments, extensions like shtml / htm / php / etc,
    # unicode escape sequences, punycode domain names, unicode double-width characters, extensions longer than
    # 4 characters, paths with multipe extensions, etc. the list goes on...

    output_path = None
    try:
        output_path = unsafe_wget_output_path(link)
    except Exception:
        pass  # better to pretend it just failed to download than expose gnarly OSErrors to users

    # check for unprintable unicode characters
    # https://github.com/ArchiveBox/ArchiveBox/issues/1373
    if output_path:
        # round-tripping through utf-8 with 'replace' substitutes any unencodable characters,
        # so a mismatch means the path contains bytes that would break other parts of archivebox
        safe_path = output_path.encode('utf-8', 'replace').decode()
        if output_path != safe_path:
            # contains unprintable unicode characters that will break other parts of archivebox
            # better to pretend it doesnt exist and fallback to parent dir than crash archivebox
            output_path = None

    # check for a path that is just too long to safely handle across different OS's
    # https://github.com/ArchiveBox/ArchiveBox/issues/549
    if output_path and len(output_path) > 250:
        output_path = None

    if output_path:
        if not nocache:
            # NOTE: `cache` was imported by the `if not nocache:` branch above;
            # both uses are guarded by the same condition so this is always in scope here
            cache.set(cache_key, output_path)
        return output_path

    # fallback to just the domain dir
    search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+")
    if search_dir.is_dir():
        return domain(link.url).replace(":", "+")

    # fallback to just the domain dir without port
    search_dir = Path(link.link_dir) / domain(link.url).split(":", 1)[0]
    if search_dir.is_dir():
        return domain(link.url).split(":", 1)[0]

    # no wget output could be located at all for this link
    return None
|
Loading…
Reference in a new issue