mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-29 23:50:22 +00:00
better config system
This commit is contained in:
parent
77ab8ebda6
commit
1249493fcd
2 changed files with 84 additions and 78 deletions
|
@ -7,7 +7,6 @@ from subprocess import run, PIPE, DEVNULL
|
|||
from index import html_appended_url, parse_json_link_index, write_link_index
|
||||
from links import links_after_timestamp
|
||||
from config import (
|
||||
ARCHIVE_PERMISSIONS,
|
||||
ARCHIVE_DIR,
|
||||
CHROME_BINARY,
|
||||
FETCH_WGET,
|
||||
|
@ -29,26 +28,90 @@ from util import (
|
|||
chmod_file,
|
||||
)
|
||||
|
||||
_RESULTS_TOTALS = {
|
||||
|
||||
_RESULTS_TOTALS = { # globals are bad, mmkay
|
||||
'skipped': 0,
|
||||
'succeded': 0,
|
||||
'failed': 0,
|
||||
}
|
||||
|
||||
|
||||
def archive_links(out_dir, links, export_path, resume=None):
|
||||
check_dependencies()
|
||||
|
||||
to_archive = links_after_timestamp(links, resume)
|
||||
try:
|
||||
for idx, link in enumerate(to_archive):
|
||||
out_dir = os.path.join(out_dir, link['timestamp'])
|
||||
archive_link(out_dir, link)
|
||||
|
||||
except (KeyboardInterrupt, SystemExit, Exception) as e:
|
||||
print('{red}[X] Archive update stopped on #{idx} out of {total} links{reset}'.format(
|
||||
**ANSI,
|
||||
idx=idx,
|
||||
total=len(list(to_archive)),
|
||||
))
|
||||
print(' Continue where you left off by running:')
|
||||
print(' ./archive.py {} {}'.format(
|
||||
export_path,
|
||||
link['timestamp'],
|
||||
))
|
||||
if not isinstance(e, KeyboardInterrupt):
|
||||
raise e
|
||||
raise SystemExit(1)
|
||||
|
||||
|
||||
def archive_link(out_dir, link, overwrite=False):
|
||||
"""download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
|
||||
|
||||
link = {**parse_json_link_index(out_dir), **link}
|
||||
log_link_archive(out_dir, link)
|
||||
|
||||
if FETCH_WGET:
|
||||
link = fetch_wget(out_dir, link, overwrite=overwrite)
|
||||
|
||||
if FETCH_PDF:
|
||||
link = fetch_pdf(out_dir, link, overwrite=overwrite)
|
||||
|
||||
if FETCH_SCREENSHOT:
|
||||
link = fetch_screenshot(out_dir, link, overwrite=overwrite)
|
||||
|
||||
if SUBMIT_ARCHIVE_DOT_ORG:
|
||||
link = archive_dot_org(out_dir, link, overwrite=overwrite)
|
||||
|
||||
# if FETCH_AUDIO:
|
||||
# link = fetch_audio(out_dir, link, overwrite=overwrite)
|
||||
|
||||
# if FETCH_VIDEO:
|
||||
# link = fetch_video(out_dir, link, overwrite=overwrite)
|
||||
|
||||
if FETCH_FAVICON:
|
||||
link = fetch_favicon(out_dir, link, overwrite=overwrite)
|
||||
|
||||
write_link_index(out_dir, link)
|
||||
|
||||
return link
|
||||
|
||||
|
||||
def attach_result_to_link(method):
|
||||
"""
|
||||
Instead of returning a result={output:'...', status:'success'} object,
|
||||
attach that result to the links's history & latest fields, then return
|
||||
the updated link object.
|
||||
"""
|
||||
def decorator(fetch_func):
|
||||
@wraps(fetch_func)
|
||||
def timed_fetch_func(out_dir, link, overwrite=False, **kwargs):
|
||||
# initialize methods and history json field on link
|
||||
link['methods'] = link.get('methods') or {}
|
||||
link['methods'][method] = link['methods'].get(method) or None
|
||||
link['latest'] = link.get('latest') or {}
|
||||
link['latest'][method] = link['latest'].get(method) or None
|
||||
link['history'] = link.get('history') or {}
|
||||
link['history'][method] = link['history'].get(method) or []
|
||||
|
||||
start_ts = datetime.now().timestamp()
|
||||
|
||||
# if a valid method output is already present, dont run the fetch function
|
||||
if link['methods'][method] and not overwrite:
|
||||
if link['latest'][method] and not overwrite:
|
||||
print(' √ Skipping: {}'.format(method))
|
||||
result = None
|
||||
else:
|
||||
|
@ -74,7 +137,7 @@ def attach_result_to_link(method):
|
|||
history_entry['duration'] = duration
|
||||
history_entry.update(result or {})
|
||||
link['history'][method].append(history_entry)
|
||||
link['methods'][method] = result['output']
|
||||
link['latest'][method] = result['output']
|
||||
|
||||
_RESULTS_TOTALS[history_entry['status']] += 1
|
||||
|
||||
|
@ -105,7 +168,6 @@ def fetch_wget(out_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT)
|
|||
print(' got wget response code {}:'.format(result.returncode))
|
||||
print('\n'.join(' ' + line for line in (result.stderr or result.stdout).decode().rsplit('\n', 10)[-10:] if line.strip()))
|
||||
# raise Exception('Failed to wget download')
|
||||
chmod_file(link['domain'], cwd=out_dir)
|
||||
except Exception as e:
|
||||
end()
|
||||
print(' Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
|
||||
|
@ -140,7 +202,6 @@ def fetch_pdf(out_dir, link, timeout=TIMEOUT):
|
|||
if result.returncode:
|
||||
print(' ', (result.stderr or result.stdout).decode())
|
||||
raise Exception('Failed to print PDF')
|
||||
chmod_file('output.pdf', cwd=out_dir)
|
||||
output = 'output.pdf'
|
||||
except Exception as e:
|
||||
end()
|
||||
|
@ -338,67 +399,11 @@ def fetch_favicon(out_dir, link, timeout=TIMEOUT):
|
|||
# print(' √ Skipping video download')
|
||||
|
||||
|
||||
def archive_links(out_dir, links, export_path, resume=None):
|
||||
check_dependencies()
|
||||
|
||||
to_archive = links_after_timestamp(links, resume)
|
||||
try:
|
||||
for idx, link in enumerate(to_archive):
|
||||
out_dir = os.path.join(out_dir, link['timestamp'])
|
||||
archive_link(out_dir, link)
|
||||
|
||||
except (KeyboardInterrupt, SystemExit, Exception) as e:
|
||||
print('{red}[X] Archive update stopped on #{idx} out of {total} links{reset}'.format(
|
||||
**ANSI,
|
||||
idx=idx,
|
||||
total=len(list(to_archive)),
|
||||
))
|
||||
print(' Continue where you left off by running:')
|
||||
print(' ./archive.py {} {}'.format(
|
||||
export_path,
|
||||
link['timestamp'],
|
||||
))
|
||||
if not isinstance(e, KeyboardInterrupt):
|
||||
raise e
|
||||
raise SystemExit(1)
|
||||
|
||||
|
||||
def archive_link(out_dir, link, overwrite=False, permissions=ARCHIVE_PERMISSIONS):
|
||||
"""download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
|
||||
|
||||
link = {**parse_json_link_index(out_dir), **link}
|
||||
log_link_archive(out_dir, link)
|
||||
|
||||
if FETCH_WGET:
|
||||
link = fetch_wget(out_dir, link, overwrite=overwrite)
|
||||
|
||||
if FETCH_PDF:
|
||||
link = fetch_pdf(out_dir, link, overwrite=overwrite)
|
||||
|
||||
if FETCH_SCREENSHOT:
|
||||
link = fetch_screenshot(out_dir, link, overwrite=overwrite)
|
||||
|
||||
if SUBMIT_ARCHIVE_DOT_ORG:
|
||||
link = archive_dot_org(out_dir, link, overwrite=overwrite)
|
||||
|
||||
# if FETCH_AUDIO:
|
||||
# link = fetch_audio(out_dir, link, overwrite=overwrite)
|
||||
|
||||
# if FETCH_VIDEO:
|
||||
# link = fetch_video(out_dir, link, overwrite=overwrite)
|
||||
|
||||
if FETCH_FAVICON:
|
||||
link = fetch_favicon(out_dir, link, overwrite=overwrite)
|
||||
|
||||
write_link_index(out_dir, link)
|
||||
|
||||
return link
|
||||
|
||||
def log_link_archive(out_dir, link):
|
||||
update_existing = os.path.exists(out_dir)
|
||||
if not update_existing:
|
||||
os.makedirs(out_dir)
|
||||
run(['chmod', ARCHIVE_PERMISSIONS, out_dir], timeout=5)
|
||||
|
||||
print('[{symbol_color}{symbol}{reset}] [{timestamp}] "{title}": {blue}{base_url}{reset}'.format(
|
||||
symbol='*' if update_existing else '+',
|
||||
|
|
29
config.py
29
config.py
|
@ -4,14 +4,11 @@ import shutil
|
|||
|
||||
from subprocess import run, PIPE
|
||||
|
||||
# os.getenv('VARIABLE', 'DEFAULT') gets the value of environment
|
||||
# variable "VARIABLE" and if it is not set, sets it to 'DEFAULT'
|
||||
|
||||
# for boolean values, check to see if the string is 'true', and
|
||||
# if so, the python variable will be True
|
||||
|
||||
# *******************************************************************************
|
||||
# *** TO SET YOUR PREFERENCES, EDIT THE VALUES HERE, or use the 'env' command ***
|
||||
# ******************************************************************************
|
||||
# * TO SET YOUR CONFIGURATION, EDIT THE VALUES BELOW, or use the 'env' command *
|
||||
# * e.g. *
|
||||
# * env USE_COLOR=True CHROME_BINARY=google-chrome ./archive.py export.html *
|
||||
# ******************************************************************************
|
||||
|
||||
IS_TTY = sys.stdout.isatty()
|
||||
USE_COLOR = os.getenv('USE_COLOR', str(IS_TTY) ).lower() == 'true'
|
||||
|
@ -35,8 +32,16 @@ LINK_INDEX_TEMPLATE = os.getenv('LINK_INDEX_TEMPLATE', 'templates/link_ind
|
|||
INDEX_TEMPLATE = os.getenv('INDEX_TEMPLATE', 'templates/index.html')
|
||||
INDEX_ROW_TEMPLATE = os.getenv('INDEX_ROW_TEMPLATE', 'templates/index_row.html')
|
||||
|
||||
# *******************************************************************************
|
||||
### Output Paths
|
||||
ROOT_FOLDER = os.path.dirname(os.path.abspath(__file__))
|
||||
HTML_FOLDER = os.path.join(ARCHIVE_DIR, 'html')
|
||||
ARCHIVE_FOLDER = os.path.join(HTML_FOLDER, 'archive')
|
||||
|
||||
# ******************************************************************************
|
||||
# ********************** Do not edit below this point **************************
|
||||
# ******************************************************************************
|
||||
|
||||
### Terminal Configuration
|
||||
TERM_WIDTH = shutil.get_terminal_size((100, 10)).columns
|
||||
ANSI = {
|
||||
'reset': '\033[00;00m',
|
||||
|
@ -53,17 +58,13 @@ if not USE_COLOR:
|
|||
# dont show colors if USE_COLOR is False
|
||||
ANSI = {k: '' for k in ANSI.keys()}
|
||||
|
||||
|
||||
ROOT_FOLDER = os.path.dirname(os.path.abspath(__file__))
|
||||
HTML_FOLDER = os.path.join(ARCHIVE_DIR, 'html')
|
||||
ARCHIVE_FOLDER = os.path.join(HTML_FOLDER, 'archive')
|
||||
### Confirm Environment Setup
|
||||
try:
|
||||
GIT_SHA = run(["git", "rev-list", "-1", "HEAD", "./"], stdout=PIPE, cwd=ROOT_FOLDER).stdout.strip().decode()
|
||||
except Exception:
|
||||
GIT_SHA = None
|
||||
print('[!] Warning, you need git installed for some archiving features to save correct version numbers!')
|
||||
|
||||
|
||||
if sys.stdout.encoding.upper() != 'UTF-8':
|
||||
print('[X] Your system is running python3 scripts with a bad locale setting: {} (it should be UTF-8).'.format(sys.stdout.encoding))
|
||||
print(' To fix it, add the line "export PYTHONIOENCODING=utf8" to your ~/.bashrc file (without quotes)')
|
||||
|
|
Loading…
Reference in a new issue