Add ARGS and EXTRA_ARGS for Mercury extractor

This commit is contained in:
Ben Muthalaly 2024-03-05 21:15:38 -06:00
parent d8cf09c21e
commit f4deb97f59
2 changed files with 14 additions and 4 deletions

View file

@ -199,6 +199,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'GIT_ARGS': {'type': list, 'default': ['--recursive']},
'SINGLEFILE_ARGS': {'type': list, 'default': None},
'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None},
'MERCURY_ARGS': {'type': list, 'default': ['--format=text']},
'MERCURY_EXTRA_ARGS': {'type': list, 'default': None},
'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'},
},
@ -561,6 +563,8 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury doesn't expose version info until this is merged: https://github.com/postlight/parser/pull/750
'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []},
'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []},
'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},

View file

@ -11,13 +11,15 @@ from ..system import run, atomic_write
from ..util import (
enforce_types,
is_static_file,
dedupe,
)
from ..config import (
TIMEOUT,
SAVE_MERCURY,
DEPENDENCIES,
MERCURY_VERSION,
MERCURY_ARGS,
MERCURY_EXTRA_ARGS,
)
from ..logging_util import TimedProgress
@ -60,12 +62,16 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
timer = TimedProgress(timeout, prefix=' ')
try:
output_folder.mkdir(exist_ok=True)
# Get plain text version of article
# later options take precedence
options = [
*MERCURY_ARGS,
*MERCURY_EXTRA_ARGS,
]
# By default, get plain text version of article
cmd = [
DEPENDENCIES['MERCURY_BINARY']['path'],
link.url,
"--format=text"
*dedupe(options)
]
result = run(cmd, cwd=out_dir, timeout=timeout)
try: