make everything take link_dir as an optional arg since it's derivable from the link URL

Nick Sweeting 2019-03-27 18:24:30 -04:00
parent 9fc1e3c3e1
commit a214bd7c02
5 changed files with 65 additions and 59 deletions
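The pattern applied across all five files: helpers that previously required link_dir as their first positional argument now take the Link first and accept link_dir as an optional keyword that falls back to link.link_dir. A minimal, runnable sketch of that before/after shape (the Link stub and the fetch_example_* names are hypothetical, not the real schema.Link or any real extractor):

    import os
    from dataclasses import dataclass
    from typing import Optional

    # Hypothetical stand-in for schema.Link; the real class derives link_dir
    # from the archive output directory plus the link's timestamp.
    @dataclass
    class Link:
        url: str
        timestamp: str

        @property
        def link_dir(self) -> str:
            return os.path.join('output', 'archive', self.timestamp)

    # Old style: link_dir was a required positional argument.
    def fetch_example_old(link_dir: str, link: Link) -> str:
        return os.path.join(link_dir, 'example.txt')

    # New style: link_dir is optional and falls back to link.link_dir.
    def fetch_example_new(link: Link, link_dir: Optional[str] = None) -> str:
        link_dir = link_dir or link.link_dir
        return os.path.join(link_dir, 'example.txt')

    link = Link(url='https://example.com', timestamp='1478739709')
    print(fetch_example_old(link.link_dir, link))    # caller had to supply the dir
    print(fetch_example_new(link))                   # dir derived from the link itself
    print(fetch_example_new(link, link_dir='/tmp'))  # still overridable when needed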

View file

@@ -180,7 +180,7 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]
 all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)
 # Step 2: Write updated index with deduped old and new links back to disk
-write_links_index(out_dir=OUTPUT_DIR, links=list(all_links))
+write_links_index(links=list(all_links), out_dir=OUTPUT_DIR)
 # Step 3: Run the archive methods for each link
 links = new_links if ONLY_NEW else all_links
@@ -189,7 +189,7 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]
 link: Optional[Link] = None
 try:
 for idx, link in enumerate(links_after_timestamp(links, resume)):
-archive_link(link)
+archive_link(link, link_dir=link.link_dir)
 except KeyboardInterrupt:
 log_archiving_paused(len(links), idx, link.timestamp if link else '0')
@@ -203,7 +203,7 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]
 # Step 4: Re-write links index with updated titles, icons, and resources
 all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
-write_links_index(out_dir=OUTPUT_DIR, links=list(all_links), finished=True)
+write_links_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
 return all_links
 if __name__ == '__main__':

View file

@@ -1,6 +1,6 @@
 import os
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Optional
 from collections import defaultdict
 from datetime import datetime
@@ -69,7 +69,7 @@ class ArchiveError(Exception):
 @enforce_types
-def archive_link(link: Link, page=None) -> Link:
+def archive_link(link: Link, link_dir: Optional[str]=None) -> Link:
 """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
 ARCHIVE_METHODS = (
@@ -84,13 +84,14 @@ def archive_link(link: Link, page=None) -> Link:
 ('archive_org', should_fetch_archive_dot_org, archive_dot_org),
 )
+link_dir = link_dir or link.link_dir
 try:
-is_new = not os.path.exists(link.link_dir)
+is_new = not os.path.exists(link_dir)
 if is_new:
-os.makedirs(link.link_dir)
+os.makedirs(link_dir)
-link = load_json_link_index(link.link_dir, link)
+link = load_json_link_index(link, link_dir)
-log_link_archiving_started(link.link_dir, link, is_new)
+log_link_archiving_started(link, link_dir, is_new)
 link = link.overwrite(updated=datetime.now())
 stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
@@ -99,10 +100,10 @@ def archive_link(link: Link, page=None) -> Link:
 if method_name not in link.history:
 link.history[method_name] = []
-if should_run(link.link_dir, link):
+if should_run(link, link_dir):
 log_archive_method_started(method_name)
-result = method_function(link.link_dir, link)
+result = method_function(link, link_dir)
 link.history[method_name].append(result)
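For context on the hunk above: archive_link walks the ARCHIVE_METHODS table of (name, should_run, run) entries, and after this commit each entry is invoked as fn(link, link_dir) instead of fn(link.link_dir, link). A rough sketch of that dispatch loop under the new calling convention (dict-based links and stub methods for illustration only; the real code uses the Link class and ArchiveResult):

    from typing import Optional

    # Hypothetical stubs standing in for the real should_fetch_* / fetch_* pairs.
    def should_fetch_title(link: dict, link_dir: Optional[str] = None) -> bool:
        return not link.get('title')

    def fetch_title(link: dict, link_dir: Optional[str] = None) -> dict:
        return {'status': 'succeeded', 'output': 'Example Title'}

    ARCHIVE_METHODS = (
        ('title', should_fetch_title, fetch_title),
    )

    def archive_link_sketch(link: dict, link_dir: Optional[str] = None) -> dict:
        link_dir = link_dir or link['link_dir']  # same fallback the real archive_link now performs
        stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
        for method_name, should_run, method_function in ARCHIVE_METHODS:
            link.setdefault('history', {}).setdefault(method_name, [])
            if should_run(link, link_dir):           # new argument order: (link, link_dir)
                result = method_function(link, link_dir)
                link['history'][method_name].append(result)
                stats[result['status']] += 1
            else:
                stats['skipped'] += 1
        return stats

    print(archive_link_sketch({'url': 'https://example.com', 'link_dir': 'output/archive/1478739709'}))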
@@ -126,7 +127,7 @@ def archive_link(link: Link, page=None) -> Link:
 patch_links_index(link)
-log_link_archiving_finished(link.link_dir, link, is_new, stats)
+log_link_archiving_finished(link, link.link_dir, is_new, stats)
 except KeyboardInterrupt:
 raise
@@ -141,7 +142,7 @@ def archive_link(link: Link, page=None) -> Link:
 ### Archive Method Functions
 @enforce_types
-def should_fetch_title(link_dir: str, link: Link) -> bool:
+def should_fetch_title(link: Link, link_dir: Optional[str]=None) -> bool:
 # if link already has valid title, skip it
 if link.title and not link.title.lower().startswith('http'):
 return False
@@ -152,7 +153,7 @@ def should_fetch_title(link_dir: str, link: Link) -> bool:
 return FETCH_TITLE
 @enforce_types
-def fetch_title(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
+def fetch_title(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
 """try to guess the page's title from its content"""
 output = None
@@ -186,14 +187,14 @@ def fetch_title(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResul
 @enforce_types
-def should_fetch_favicon(link_dir: str, link: Link) -> bool:
+def should_fetch_favicon(link: Link, link_dir: Optional[str]=None) -> bool:
 if os.path.exists(os.path.join(link_dir, 'favicon.ico')):
 return False
 return FETCH_FAVICON
 @enforce_types
-def fetch_favicon(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
+def fetch_favicon(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
 """download site favicon from google's favicon api"""
 output = 'favicon.ico'
@@ -226,7 +227,7 @@ def fetch_favicon(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveRes
 )
 @enforce_types
-def should_fetch_wget(link_dir: str, link: Link) -> bool:
+def should_fetch_wget(link: Link, link_dir: Optional[str]=None) -> bool:
 output_path = wget_output_path(link)
 if output_path and os.path.exists(os.path.join(link_dir, output_path)):
 return False
@@ -235,7 +236,7 @@ def should_fetch_wget(link_dir: str, link: Link) -> bool:
 @enforce_types
-def fetch_wget(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
+def fetch_wget(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
 """download full site using wget"""
 if FETCH_WARC:
@@ -315,7 +316,7 @@ def fetch_wget(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult
 )
 @enforce_types
-def should_fetch_pdf(link_dir: str, link: Link) -> bool:
+def should_fetch_pdf(link: Link, link_dir: Optional[str]=None) -> bool:
 if is_static_file(link.url):
 return False
@@ -326,7 +327,7 @@ def should_fetch_pdf(link_dir: str, link: Link) -> bool:
 @enforce_types
-def fetch_pdf(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
+def fetch_pdf(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
 """print PDF of site to file using chrome --headless"""
 output = 'output.pdf'
@@ -361,7 +362,7 @@ def fetch_pdf(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
 )
 @enforce_types
-def should_fetch_screenshot(link_dir: str, link: Link) -> bool:
+def should_fetch_screenshot(link: Link, link_dir: Optional[str]=None) -> bool:
 if is_static_file(link.url):
 return False
@@ -371,7 +372,7 @@ def should_fetch_screenshot(link_dir: str, link: Link) -> bool:
 return FETCH_SCREENSHOT
 @enforce_types
-def fetch_screenshot(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
+def fetch_screenshot(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
 """take screenshot of site using chrome --headless"""
 output = 'screenshot.png'
@@ -406,7 +407,7 @@ def fetch_screenshot(link_dir: str, link: Link, timeout: int=TIMEOUT) -> Archive
 )
 @enforce_types
-def should_fetch_dom(link_dir: str, link: Link) -> bool:
+def should_fetch_dom(link: Link, link_dir: Optional[str]=None) -> bool:
 if is_static_file(link.url):
 return False
@@ -416,7 +417,7 @@ def should_fetch_dom(link_dir: str, link: Link) -> bool:
 return FETCH_DOM
 @enforce_types
-def fetch_dom(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
+def fetch_dom(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
 """print HTML of site to file using chrome --dump-html"""
 output = 'output.html'
@@ -453,7 +454,7 @@ def fetch_dom(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
 )
 @enforce_types
-def should_fetch_git(link_dir: str, link: Link) -> bool:
+def should_fetch_git(link: Link, link_dir: Optional[str]=None) -> bool:
 if is_static_file(link.url):
 return False
@@ -471,7 +472,7 @@ def should_fetch_git(link_dir: str, link: Link) -> bool:
 @enforce_types
-def fetch_git(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
+def fetch_git(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
 """download full site using git"""
 output = 'git'
@@ -514,7 +515,7 @@ def fetch_git(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
 @enforce_types
-def should_fetch_media(link_dir: str, link: Link) -> bool:
+def should_fetch_media(link: Link, link_dir: Optional[str]=None) -> bool:
 if is_static_file(link.url):
 return False
@@ -524,7 +525,7 @@ def should_fetch_media(link_dir: str, link: Link) -> bool:
 return FETCH_MEDIA
 @enforce_types
-def fetch_media(link_dir: str, link: Link, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
+def fetch_media(link: Link, link_dir: Optional[str]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
 """Download playlists or individual video, audio, and subtitles using youtube-dl"""
 output = 'media'
@@ -588,7 +589,7 @@ def fetch_media(link_dir: str, link: Link, timeout: int=MEDIA_TIMEOUT) -> Archiv
 @enforce_types
-def should_fetch_archive_dot_org(link_dir: str, link: Link) -> bool:
+def should_fetch_archive_dot_org(link: Link, link_dir: Optional[str]=None) -> bool:
 if is_static_file(link.url):
 return False
@@ -599,7 +600,7 @@ def should_fetch_archive_dot_org(link_dir: str, link: Link) -> bool:
 return SUBMIT_ARCHIVE_DOT_ORG
 @enforce_types
-def archive_dot_org(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
+def archive_dot_org(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
 """submit site to archive.org for archiving via their service, save returned archive url"""
 output = 'archive.org.txt'
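After this file's changes, every should_fetch_* / fetch_* pair shares one shape: a predicate should_fetch_x(link, link_dir=None) -> bool and a worker fetch_x(link, link_dir=None, timeout=...) -> ArchiveResult, which is what lets them sit uniformly in the ARCHIVE_METHODS table. A hedged sketch of that shared interface as type aliases (the Link/ArchiveResult placeholders are simplified stand-ins for the real schema classes, not the actual definitions):

    from typing import Any, Callable, Dict, Optional, Tuple

    # Simplified placeholders for schema.Link and schema.ArchiveResult.
    Link = Dict[str, Any]
    ArchiveResult = Dict[str, Any]

    # The shared shape every extractor pair now follows.
    ShouldFetch = Callable[[Link, Optional[str]], bool]
    FetchMethod = Callable[..., ArchiveResult]  # (link, link_dir=None, timeout=...) -> ArchiveResult

    # An ARCHIVE_METHODS-style registry entry: (name, predicate, worker).
    MethodEntry = Tuple[str, ShouldFetch, FetchMethod]

The uniform signature is also why archive_link can keep passing link_dir explicitly to every entry while ad-hoc callers may simply omit it.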

View file

@@ -39,23 +39,25 @@ from .logs import (
 TITLE_LOADING_MSG = 'Not yet archived...'
 ### Homepage index for all the links
 @enforce_types
-def write_links_index(out_dir: str, links: List[Link], finished: bool=False) -> None:
+def write_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
 """create index.html file for a given list of links"""
 log_indexing_process_started()
 log_indexing_started(out_dir, 'index.json')
 timer = TimedProgress(TIMEOUT * 2, prefix=' ')
-write_json_links_index(out_dir, links)
+write_json_links_index(links, out_dir=out_dir)
 timer.end()
 log_indexing_finished(out_dir, 'index.json')
 log_indexing_started(out_dir, 'index.html')
 timer = TimedProgress(TIMEOUT * 2, prefix=' ')
-write_html_links_index(out_dir, links, finished=finished)
+write_html_links_index(links, out_dir=out_dir, finished=finished)
 timer.end()
 log_indexing_finished(out_dir, 'index.html')
@@ -87,7 +89,7 @@ def load_links_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -
 @enforce_types
-def write_json_links_index(out_dir: str, links: List[Link]) -> None:
+def write_json_links_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
 """write the json link index to a given path"""
 assert isinstance(links, List), 'Links must be a list, not a generator.'
@@ -199,7 +201,6 @@ def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
 successful = link.num_outputs
 # Patch JSON index
-changed = False
 json_file_links = parse_json_links_index(out_dir)
 patched_links = []
 for saved_link in json_file_links:
@@ -212,7 +213,7 @@ def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
 else:
 patched_links.append(saved_link)
-write_json_links_index(out_dir, patched_links)
+write_json_links_index(patched_links, out_dir=out_dir)
 # Patch HTML index
 html_path = os.path.join(out_dir, 'index.html')
@@ -231,27 +232,27 @@ def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
 ### Individual link index
 @enforce_types
-def write_link_index(out_dir: str, link: Link) -> None:
+def write_link_index(link: Link, link_dir: Optional[str]=None) -> None:
-write_json_link_index(out_dir, link)
-write_html_link_index(out_dir, link)
+link_dir = link_dir or link.link_dir
+write_json_link_index(link, link_dir)
+write_html_link_index(link, link_dir)
 @enforce_types
-def write_json_link_index(out_dir: str, link: Link) -> None:
+def write_json_link_index(link: Link, link_dir: Optional[str]=None) -> None:
 """write a json file with some info about the link"""
-path = os.path.join(out_dir, 'index.json')
+link_dir = link_dir or link.link_dir
+path = os.path.join(link_dir, 'index.json')
 with open(path, 'w', encoding='utf-8') as f:
 json.dump(link._asdict(), f, indent=4, cls=ExtendedEncoder)
 chmod_file(path)
 @enforce_types
-def parse_json_link_index(out_dir: str) -> Optional[Link]:
+def parse_json_link_index(link_dir: str) -> Optional[Link]:
 """load the json link index from a given directory"""
-existing_index = os.path.join(out_dir, 'index.json')
+existing_index = os.path.join(link_dir, 'index.json')
 if os.path.exists(existing_index):
 with open(existing_index, 'r', encoding='utf-8') as f:
 link_json = json.load(f)
@@ -260,18 +261,21 @@ def parse_json_link_index(out_dir: str) -> Optional[Link]:
 @enforce_types
-def load_json_link_index(out_dir: str, link: Link) -> Link:
+def load_json_link_index(link: Link, link_dir: Optional[str]=None) -> Link:
 """check for an existing link archive in the given directory,
 and load+merge it into the given link dict
 """
-existing_link = parse_json_link_index(out_dir)
+link_dir = link_dir or link.link_dir
+existing_link = parse_json_link_index(link_dir)
 if existing_link:
 return merge_links(existing_link, link)
 return link
 @enforce_types
-def write_html_link_index(out_dir: str, link: Link) -> None:
+def write_html_link_index(link: Link, link_dir: Optional[str]=None) -> None:
+link_dir = link_dir or link.link_dir
 with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
 link_html = f.read()
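The per-link index functions above implement a read-merge-write cycle: if link_dir/index.json already exists it is parsed and merged with the incoming Link, otherwise the incoming Link is used as-is, and the result is written back out. A stripped-down sketch of that cycle using plain dicts instead of the real Link/merge_links (the *_sketch names are hypothetical):

    import json
    import os
    from typing import Optional

    def parse_json_link_index_sketch(link_dir: str) -> Optional[dict]:
        """Load link_dir/index.json if it exists, else return None."""
        path = os.path.join(link_dir, 'index.json')
        if os.path.exists(path):
            with open(path, 'r', encoding='utf-8') as f:
                return json.load(f)
        return None

    def load_json_link_index_sketch(link: dict, link_dir: Optional[str] = None) -> dict:
        """Merge any existing on-disk index into the incoming link."""
        link_dir = link_dir or link['link_dir']
        existing = parse_json_link_index_sketch(link_dir)
        if existing:
            # the real merge_links is more involved (titles, tags, per-method history);
            # here incoming non-None fields simply override what was on disk
            return {**existing, **{k: v for k, v in link.items() if v is not None}}
        return link

    def write_json_link_index_sketch(link: dict, link_dir: Optional[str] = None) -> None:
        link_dir = link_dir or link['link_dir']
        os.makedirs(link_dir, exist_ok=True)
        with open(os.path.join(link_dir, 'index.json'), 'w', encoding='utf-8') as f:
            json.dump(link, f, indent=4)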

View file

@@ -6,7 +6,7 @@ from dataclasses import dataclass
 from typing import Optional
 from .schema import Link, ArchiveResult
-from .config import ANSI, REPO_DIR, OUTPUT_DIR
+from .config import ANSI, OUTPUT_DIR
 @dataclass
@@ -17,14 +17,14 @@ class RuntimeStats:
 succeeded: int = 0
 failed: int = 0
-parse_start_ts: datetime = None
+parse_start_ts: Optional[datetime] = None
-parse_end_ts: datetime = None
+parse_end_ts: Optional[datetime] = None
-index_start_ts: datetime = None
+index_start_ts: Optional[datetime] = None
-index_end_ts: datetime = None
+index_end_ts: Optional[datetime] = None
-archiving_start_ts: datetime = None
+archiving_start_ts: Optional[datetime] = None
-archiving_end_ts: datetime = None
+archiving_end_ts: Optional[datetime] = None
 # globals are bad, mmkay
 _LAST_RUN_STATS = RuntimeStats()
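The RuntimeStats change is purely a typing fix: the timestamp fields default to None, so annotating them as plain datetime was inaccurate, and Optional[datetime] makes the None default legal. A small sketch of the corrected dataclass, trimmed to two timestamp fields for brevity:

    from dataclasses import dataclass
    from datetime import datetime
    from typing import Optional

    @dataclass
    class RuntimeStats:
        # counters start at 0, timestamps start out unset
        succeeded: int = 0
        failed: int = 0
        parse_start_ts: Optional[datetime] = None
        parse_end_ts: Optional[datetime] = None

    stats = RuntimeStats()
    stats.parse_start_ts = datetime.now()   # set once parsing actually begins
    print(stats)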
@@ -131,7 +131,7 @@ def log_archiving_finished(num_links: int):
 print(' {}/index.html'.format(OUTPUT_DIR))
-def log_link_archiving_started(link_dir: str, link: Link, is_new: bool):
+def log_link_archiving_started(link: Link, link_dir: str, is_new: bool):
 # [*] [2019-03-22 13:46:45] "Log Structured Merge Trees - ben stopford"
 # http://www.benstopford.com/2015/02/14/log-structured-merge-trees/
 # > output/archive/1478739709
@@ -149,7 +149,7 @@ def log_link_archiving_started(link_dir: str, link: Link, is_new: bool):
 pretty_path(link_dir),
 ))
-def log_link_archiving_finished(link_dir: str, link: Link, is_new: bool, stats: dict):
+def log_link_archiving_finished(link: Link, link_dir: str, is_new: bool, stats: dict):
 total = sum(stats.values())
 if stats['failed'] > 0 :

View file

@@ -1,11 +1,12 @@
 import os
 import re
 import sys
+import json
 import time
 import shutil
 from json import JSONEncoder
-from typing import List, Optional, Any
+from typing import List, Optional, Any, Union
 from inspect import signature, _empty
 from functools import wraps
 from hashlib import sha256
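The @enforce_types decorator applied throughout the earlier diffs lives in this last file; judging from the inspect.signature / _empty and functools.wraps imports, it validates call arguments against the parameter annotations at runtime, which is why the signatures above had to widen from link_dir: str to link_dir: Optional[str]=None once None became a legal value. A rough re-creation of that idea, offered only as an assumption-laden sketch rather than ArchiveBox's actual implementation:

    import functools
    from inspect import signature, _empty
    from typing import Optional, Union, get_args, get_origin

    def enforce_types(func):
        """Simplified stand-in: check call arguments against annotations at runtime."""
        sig = signature(func)

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            bound = sig.bind(*args, **kwargs)
            bound.apply_defaults()
            for name, value in bound.arguments.items():
                annotation = sig.parameters[name].annotation
                if annotation is _empty:
                    continue
                allowed = get_args(annotation) if get_origin(annotation) is Union else (annotation,)
                allowed = tuple(a for a in allowed if isinstance(a, type))
                if allowed and not isinstance(value, allowed):
                    raise TypeError(f'{func.__name__}() got {name}={value!r}, expected {annotation}')
            return func(*args, **kwargs)
        return wrapper

    @enforce_types
    def fetch_example(link_dir: Optional[str] = None) -> str:
        return link_dir or 'output/archive/1478739709'

    print(fetch_example())           # OK: None is permitted by Optional[str]
    print(fetch_example('out/123'))  # OK: str
    # fetch_example(123)             # would raise TypeError under this checker

Under a checker like this, the old annotation link_dir: str would reject the new default of None, so the Optional[str] changes and the added typing imports go hand in hand.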