better loading and saving storage mechanism

This commit is contained in:
Nick Sweeting 2019-04-17 02:25:28 -04:00
parent c95f893b61
commit 9ce47431da
14 changed files with 395 additions and 238 deletions

View file

@ -6,9 +6,9 @@ from datetime import datetime
from .schema import Link, ArchiveResult, ArchiveOutput from .schema import Link, ArchiveResult, ArchiveOutput
from .index import ( from .index import (
write_link_index, load_link_details,
patch_links_index, write_link_details,
load_json_link_index, patch_main_index,
) )
from .config import ( from .config import (
CURL_BINARY, CURL_BINARY,

View file

@ -115,7 +115,6 @@ URL_BLACKLIST_PTN = re.compile(URL_BLACKLIST, re.IGNORECASE) if URL_BLACKLIST el
VERSION = open(os.path.join(REPO_DIR, 'VERSION'), 'r').read().strip() VERSION = open(os.path.join(REPO_DIR, 'VERSION'), 'r').read().strip()
GIT_SHA = VERSION.split('+')[-1] or 'unknown' GIT_SHA = VERSION.split('+')[-1] or 'unknown'
HAS_INVALID_DEPENDENCIES = False HAS_INVALID_DEPENDENCIES = False
HAS_INVALID_DB = not os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))
### Check system environment ### Check system environment
if USER == 'root': if USER == 'root':
@ -429,13 +428,12 @@ def check_dependencies() -> None:
raise SystemExit(1) raise SystemExit(1)
def check_data_folder() -> None: def check_data_folder() -> None:
if HAS_INVALID_DB: if not os.path.exists(os.path.join(OUTPUT_DIR, 'index.json')):
stderr('{red}[X] No archive data found in:{reset} {}'.format(OUTPUT_DIR, **ANSI)) stderr('{red}[X] No archive data was found in:{reset} {}'.format(OUTPUT_DIR, **ANSI))
stderr(' Are you running archivebox in the right folder?') stderr(' Are you running archivebox in the right folder?')
stderr(' cd path/to/your/archive') stderr(' cd path/to/your/archive/folder')
stderr(' archivebox [command]') stderr(' archivebox [command]')
stderr() stderr()
stderr(' To create a new archive folder, run:') stderr(' To create a new archive collection in this folder, run:')
stderr(' mkdir new_archive_dir && cd new_archive_dir')
stderr(' archivebox init') stderr(' archivebox init')
raise SystemExit(1) raise SystemExit(1)

View file

@ -1,33 +1,28 @@
import os import os
import json import json
from datetime import datetime from typing import List, Tuple, Optional, Iterable
from string import Template
from typing import List, Tuple, Iterator, Optional, Mapping, Iterable
from collections import OrderedDict from collections import OrderedDict
from .schema import Link, ArchiveResult from .schema import Link, ArchiveResult
from .config import ( from .config import (
OUTPUT_DIR, OUTPUT_DIR,
TEMPLATES_DIR,
VERSION,
GIT_SHA,
FOOTER_INFO,
TIMEOUT, TIMEOUT,
URL_BLACKLIST_PTN, URL_BLACKLIST_PTN,
ANSI, ANSI,
stderr, stderr,
) )
from .storage.html import write_html_main_index, write_html_link_details
from .storage.json import (
parse_json_main_index,
write_json_main_index,
parse_json_link_details,
write_json_link_details,
)
from .util import ( from .util import (
scheme, scheme,
ts_to_date,
urlencode,
htmlencode,
urldecode,
wget_output_path,
enforce_types, enforce_types,
TimedProgress, TimedProgress,
copy_and_overwrite,
atomic_write, atomic_write,
ExtendedEncoder, ExtendedEncoder,
) )
@ -40,8 +35,6 @@ from .logs import (
log_parsing_finished, log_parsing_finished,
) )
TITLE_LOADING_MSG = 'Not yet archived...'
### Link filtering and checking ### Link filtering and checking
@ -53,8 +46,10 @@ def merge_links(a: Link, b: Link) -> Link:
""" """
assert a.base_url == b.base_url, 'Cannot merge two links with different URLs' assert a.base_url == b.base_url, 'Cannot merge two links with different URLs'
# longest url wins (because a fuzzy url will always be shorter)
url = a.url if len(a.url) > len(b.url) else b.url url = a.url if len(a.url) > len(b.url) else b.url
# best title based on length and quality
possible_titles = [ possible_titles = [
title title
for title in (a.title, b.title) for title in (a.title, b.title)
@ -66,20 +61,24 @@ def merge_links(a: Link, b: Link) -> Link:
elif len(possible_titles) == 1: elif len(possible_titles) == 1:
title = possible_titles[0] title = possible_titles[0]
# earliest valid timestamp
timestamp = ( timestamp = (
a.timestamp a.timestamp
if float(a.timestamp or 0) < float(b.timestamp or 0) else if float(a.timestamp or 0) < float(b.timestamp or 0) else
b.timestamp b.timestamp
) )
# all unique, truthy tags
tags_set = ( tags_set = (
set(tag.strip() for tag in (a.tags or '').split(',')) set(tag.strip() for tag in (a.tags or '').split(','))
| set(tag.strip() for tag in (b.tags or '').split(',')) | set(tag.strip() for tag in (b.tags or '').split(','))
) )
tags = ','.join(tags_set) or None tags = ','.join(tags_set) or None
# all unique source entries
sources = list(set(a.sources + b.sources)) sources = list(set(a.sources + b.sources))
# all unique history entries for the combined archive methods
all_methods = set(list(a.history.keys()) + list(a.history.keys())) all_methods = set(list(a.history.keys()) + list(a.history.keys()))
history = { history = {
method: (a.history.get(method) or []) + (b.history.get(method) or []) method: (a.history.get(method) or []) + (b.history.get(method) or [])
@ -95,7 +94,6 @@ def merge_links(a: Link, b: Link) -> Link:
key=lambda result: result.start_ts, key=lambda result: result.start_ts,
))) )))
return Link( return Link(
url=url, url=url,
timestamp=timestamp, timestamp=timestamp,
@ -105,6 +103,8 @@ def merge_links(a: Link, b: Link) -> Link:
history=history, history=history,
) )
@enforce_types
def validate_links(links: Iterable[Link]) -> Iterable[Link]: def validate_links(links: Iterable[Link]) -> Iterable[Link]:
links = archivable_links(links) # remove chrome://, about:, mailto: etc. links = archivable_links(links) # remove chrome://, about:, mailto: etc.
links = sorted_links(links) # deterministically sort the links based on timstamp, url links = sorted_links(links) # deterministically sort the links based on timstamp, url
@ -121,6 +121,8 @@ def validate_links(links: Iterable[Link]) -> Iterable[Link]:
return links return links
@enforce_types
def archivable_links(links: Iterable[Link]) -> Iterable[Link]: def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
"""remove chrome://, about:// or other schemed links that cant be archived""" """remove chrome://, about:// or other schemed links that cant be archived"""
for link in links: for link in links:
@ -130,6 +132,7 @@ def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
yield link yield link
@enforce_types
def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]: def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
""" """
ensures that all non-duplicate links have monotonically increasing timestamps ensures that all non-duplicate links have monotonically increasing timestamps
@ -153,12 +156,14 @@ def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
return unique_timestamps.values() return unique_timestamps.values()
@enforce_types
def sorted_links(links: Iterable[Link]) -> Iterable[Link]: def sorted_links(links: Iterable[Link]) -> Iterable[Link]:
sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url) sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url)
return sorted(links, key=sort_func, reverse=True) return sorted(links, key=sort_func, reverse=True)
def links_after_timestamp(links: Iterable[Link], resume: float=None) -> Iterable[Link]: @enforce_types
def links_after_timestamp(links: Iterable[Link], resume: Optional[float]=None) -> Iterable[Link]:
if not resume: if not resume:
yield from links yield from links
return return
@ -171,6 +176,7 @@ def links_after_timestamp(links: Iterable[Link], resume: float=None) -> Iterable
print('Resume value and all timestamp values must be valid numbers.') print('Resume value and all timestamp values must be valid numbers.')
@enforce_types
def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str: def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
"""resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2""" """resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""
@ -190,10 +196,10 @@ def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
### Homepage index for all the links ### Main Links Index
@enforce_types @enforce_types
def write_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None: def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
"""create index.html file for a given list of links""" """create index.html file for a given list of links"""
log_indexing_process_started() log_indexing_process_started()
@ -201,7 +207,7 @@ def write_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool
log_indexing_started(out_dir, 'index.json') log_indexing_started(out_dir, 'index.json')
timer = TimedProgress(TIMEOUT * 2, prefix=' ') timer = TimedProgress(TIMEOUT * 2, prefix=' ')
try: try:
write_json_links_index(links, out_dir=out_dir) write_json_main_index(links, out_dir=out_dir)
finally: finally:
timer.end() timer.end()
log_indexing_finished(out_dir, 'index.json') log_indexing_finished(out_dir, 'index.json')
@ -209,19 +215,19 @@ def write_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool
log_indexing_started(out_dir, 'index.html') log_indexing_started(out_dir, 'index.html')
timer = TimedProgress(TIMEOUT * 2, prefix=' ') timer = TimedProgress(TIMEOUT * 2, prefix=' ')
try: try:
write_html_links_index(links, out_dir=out_dir, finished=finished) write_html_main_index(links, out_dir=out_dir, finished=finished)
finally: finally:
timer.end() timer.end()
log_indexing_finished(out_dir, 'index.html') log_indexing_finished(out_dir, 'index.html')
@enforce_types @enforce_types
def load_links_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -> Tuple[List[Link], List[Link]]: def load_main_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -> Tuple[List[Link], List[Link]]:
"""parse and load existing index with any new links from import_path merged in""" """parse and load existing index with any new links from import_path merged in"""
existing_links: List[Link] = [] existing_links: List[Link] = []
if out_dir: if out_dir:
existing_links = list(parse_json_links_index(out_dir)) existing_links = list(parse_json_main_index(out_dir))
new_links: List[Link] = [] new_links: List[Link] = []
if import_path: if import_path:
@ -242,108 +248,16 @@ def load_links_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -
@enforce_types @enforce_types
def write_json_links_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None: def patch_main_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
"""write the json link index to a given path""" """hack to in-place update one row's info in the generated index files"""
assert isinstance(links, List), 'Links must be a list, not a generator.' # TODO: remove this ASAP, it's ugly, error-prone, and potentially dangerous
assert not links or isinstance(links[0].history, dict)
assert not links or isinstance(links[0].sources, list)
if links and links[0].history.get('title'): title = link.title or link.latest_outputs(status='succeeded')['title']
assert isinstance(links[0].history['title'][0], ArchiveResult)
if links and links[0].sources:
assert isinstance(links[0].sources[0], str)
path = os.path.join(out_dir, 'index.json')
index_json = {
'info': 'ArchiveBox Index',
'source': 'https://github.com/pirate/ArchiveBox',
'docs': 'https://github.com/pirate/ArchiveBox/wiki',
'version': VERSION,
'num_links': len(links),
'updated': datetime.now(),
'links': links,
}
atomic_write(index_json, path)
@enforce_types
def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
"""parse a archive index json file and return the list of links"""
index_path = os.path.join(out_dir, 'index.json')
if os.path.exists(index_path):
with open(index_path, 'r', encoding='utf-8') as f:
links = json.load(f)['links']
for link_json in links:
yield Link.from_json(link_json)
return ()
@enforce_types
def write_html_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
"""write the html link index to a given path"""
copy_and_overwrite(
os.path.join(TEMPLATES_DIR, 'static'),
os.path.join(out_dir, 'static'),
)
atomic_write('User-agent: *\nDisallow: /', os.path.join(out_dir, 'robots.txt'))
with open(os.path.join(TEMPLATES_DIR, 'index.html'), 'r', encoding='utf-8') as f:
index_html = f.read()
with open(os.path.join(TEMPLATES_DIR, 'index_row.html'), 'r', encoding='utf-8') as f:
link_row_html = f.read()
link_rows = []
for link in links:
template_row_vars: Mapping[str, str] = {
**derived_link_info(link),
'title': (
link.title
or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
),
'tags': (link.tags or '') + (' {}'.format(link.extension) if link.is_static else ''),
'favicon_url': (
os.path.join('archive', link.timestamp, 'favicon.ico')
# if link['is_archived'] else ''
),
'archive_url': urlencode(
wget_output_path(link) or 'index.html'
),
}
link_rows.append(Template(link_row_html).substitute(**template_row_vars))
template_vars: Mapping[str, str] = {
'num_links': str(len(links)),
'date_updated': datetime.now().strftime('%Y-%m-%d'),
'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
'footer_info': FOOTER_INFO,
'version': VERSION,
'git_sha': GIT_SHA,
'rows': '\n'.join(link_rows),
'status': 'finished' if finished else 'running',
}
template_html = Template(index_html).substitute(**template_vars)
atomic_write(template_html, os.path.join(out_dir, 'index.html'))
@enforce_types
def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
"""hack to in-place update one row's info in the generated index html"""
title = link.title or link.latest_outputs()['title']
successful = link.num_outputs successful = link.num_outputs
# Patch JSON index # Patch JSON main index
json_file_links = parse_json_links_index(out_dir) json_file_links = parse_json_main_index(out_dir)
patched_links = [] patched_links = []
for saved_link in json_file_links: for saved_link in json_file_links:
if saved_link.url == link.url: if saved_link.url == link.url:
@ -355,11 +269,12 @@ def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
else: else:
patched_links.append(saved_link) patched_links.append(saved_link)
write_json_links_index(patched_links, out_dir=out_dir) write_json_main_index(patched_links, out_dir=out_dir)
# Patch HTML index # Patch HTML main index
html_path = os.path.join(out_dir, 'index.html') html_path = os.path.join(out_dir, 'index.html')
html = open(html_path, 'r').read().split('\n') with open(html_path, 'r') as f:
html = f.read().split('\n')
for idx, line in enumerate(html): for idx, line in enumerate(html):
if title and ('<span data-title-for="{}"'.format(link.url) in line): if title and ('<span data-title-for="{}"'.format(link.url) in line):
html[idx] = '<span>{}</span>'.format(title) html[idx] = '<span>{}</span>'.format(title)
@ -370,76 +285,31 @@ def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
atomic_write('\n'.join(html), html_path) atomic_write('\n'.join(html), html_path)
### Individual link index ### Link Details Index
@enforce_types @enforce_types
def write_link_index(link: Link, link_dir: Optional[str]=None) -> None: def write_link_details(link: Link, out_dir: Optional[str]=None) -> None:
link_dir = link_dir or link.link_dir out_dir = out_dir or link.link_dir
write_json_link_index(link, link_dir) write_json_link_details(link, out_dir=out_dir)
write_html_link_index(link, link_dir) write_html_link_details(link, out_dir=out_dir)
@enforce_types @enforce_types
def write_json_link_index(link: Link, link_dir: Optional[str]=None) -> None: def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
"""write a json file with some info about the link"""
link_dir = link_dir or link.link_dir
path = os.path.join(link_dir, 'index.json')
atomic_write(link._asdict(), path)
@enforce_types
def parse_json_link_index(link_dir: str) -> Optional[Link]:
"""load the json link index from a given directory"""
existing_index = os.path.join(link_dir, 'index.json')
if os.path.exists(existing_index):
with open(existing_index, 'r', encoding='utf-8') as f:
link_json = json.load(f)
return Link.from_json(link_json)
return None
@enforce_types
def load_json_link_index(link: Link, link_dir: Optional[str]=None) -> Link:
"""check for an existing link archive in the given directory, """check for an existing link archive in the given directory,
and load+merge it into the given link dict and load+merge it into the given link dict
""" """
link_dir = link_dir or link.link_dir out_dir = out_dir or link.link_dir
existing_link = parse_json_link_index(link_dir)
existing_link = parse_json_link_details(out_dir)
if existing_link: if existing_link:
return merge_links(existing_link, link) return merge_links(existing_link, link)
return link return link
@enforce_types
def write_html_link_index(link: Link, link_dir: Optional[str]=None) -> None:
link_dir = link_dir or link.link_dir
with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
link_html = f.read()
path = os.path.join(link_dir, 'index.html')
template_vars: Mapping[str, str] = {
**derived_link_info(link),
'title': (
link.title
or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
),
'url_str': htmlencode(urldecode(link.base_url)),
'archive_url': urlencode(
wget_output_path(link)
or (link.domain if link.is_archived else 'about:blank')
),
'extension': link.extension or 'html',
'tags': link.tags or 'untagged',
'status': 'archived' if link.is_archived else 'not yet archived',
'status_color': 'success' if link.is_archived else 'danger',
'oldest_archive_date': ts_to_date(link.oldest_archive_date),
}
html_index = Template(link_html).substitute(**template_vars)
atomic_write(html_index, path)

View file

@ -1,3 +1,4 @@
import os
import re import re
import shutil import shutil
@ -7,13 +8,18 @@ from .schema import Link
from .util import enforce_types, TimedProgress from .util import enforce_types, TimedProgress
from .index import ( from .index import (
links_after_timestamp, links_after_timestamp,
load_links_index, load_main_index,
write_links_index, write_main_index,
) )
from .archive_methods import archive_link from .archive_methods import archive_link
from .config import ( from .config import (
stderr,
ANSI,
ONLY_NEW, ONLY_NEW,
OUTPUT_DIR, OUTPUT_DIR,
SOURCES_DIR,
ARCHIVE_DIR,
DATABASE_DIR,
check_dependencies, check_dependencies,
check_data_folder, check_data_folder,
) )
@ -28,6 +34,51 @@ from .logs import (
) )
@enforce_types
def init():
    """Set up a new archive collection inside OUTPUT_DIR.

    OUTPUT_DIR is created when missing. If it contains anything other than a
    few known-harmless entries, nothing is initialized and SystemExit(1) is
    raised: with one message when an index.json is already present, and with
    another when the folder simply holds unrelated files. Otherwise the
    sources, archive and database folders are created and an empty main index
    is written.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # entries that may be present without the folder counting as "in use"
    ignorable = {'.DS_Store', '.venv', 'venv', 'virtualenv', '.virtualenv'}
    leftover_entries = set(os.listdir(OUTPUT_DIR)) - ignorable
    has_index = os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))

    if leftover_entries and has_index:
        stderr('{green}[√] You already have an archive index in: {}{reset}'.format(OUTPUT_DIR, **ANSI))
        stderr(' To add new links, you can run:')
        stderr(" archivebox add 'https://example.com'")
        stderr()
        stderr(' For more usage and examples, run:')
        stderr(' archivebox help')
        # TODO: import old archivebox version's archive data folder
        raise SystemExit(1)

    if leftover_entries:
        not_empty_msg = (
            "{red}[X] This folder already has files in it. You must run init inside a completely empty directory.{reset}"
            "\n\n"
            " {lightred}Hint:{reset} To import a data folder created by an older version of ArchiveBox, \n"
            " just cd into the folder and run the archivebox command to pick up where you left off.\n\n"
            " (Always make sure your data folder is backed up first before updating ArchiveBox)"
        ).format(OUTPUT_DIR, **ANSI)
        stderr(not_empty_msg)
        raise SystemExit(1)

    stderr('{green}[+] Initializing new archive directory: {}{reset}'.format(OUTPUT_DIR, **ANSI))

    # create each data folder and report it, in this fixed order
    for new_dir in (SOURCES_DIR, ARCHIVE_DIR, DATABASE_DIR):
        os.makedirs(new_dir)
        stderr(f' > {new_dir}')

    write_main_index([], out_dir=OUTPUT_DIR, finished=True)
    stderr('{green}[√] Done.{reset}'.format(**ANSI))
@enforce_types @enforce_types
def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]=None, only_new: bool=False) -> List[Link]: def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]=None, only_new: bool=False) -> List[Link]:
"""The main ArchiveBox entrancepoint. Everything starts here.""" """The main ArchiveBox entrancepoint. Everything starts here."""
@ -37,19 +88,19 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]
# Step 1: Load list of links from the existing index # Step 1: Load list of links from the existing index
# merge in and dedupe new links from import_path # merge in and dedupe new links from import_path
all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path) all_links, new_links = load_main_index(out_dir=OUTPUT_DIR, import_path=import_path)
# Step 2: Write updated index with deduped old and new links back to disk # Step 2: Write updated index with deduped old and new links back to disk
write_links_index(links=list(all_links), out_dir=OUTPUT_DIR) write_main_index(links=list(all_links), out_dir=OUTPUT_DIR)
# Step 3: Run the archive methods for each link # Step 3: Run the archive methods for each link
links = new_links if ONLY_NEW else all_links links = new_links if ONLY_NEW else all_links
log_archiving_started(len(links), resume) log_archiving_started(len(links), resume)
idx: int = 0 idx: int = 0
link: Optional[Link] = None link: Link = None # type: ignore
try: try:
for idx, link in enumerate(links_after_timestamp(links, resume)): for idx, link in enumerate(links_after_timestamp(links, resume)):
archive_link(link, link_dir=link.link_dir) archive_link(link, out_dir=link.link_dir)
except KeyboardInterrupt: except KeyboardInterrupt:
log_archiving_paused(len(links), idx, link.timestamp if link else '0') log_archiving_paused(len(links), idx, link.timestamp if link else '0')
@ -62,8 +113,8 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]
log_archiving_finished(len(links)) log_archiving_finished(len(links))
# Step 4: Re-write links index with updated titles, icons, and resources # Step 4: Re-write links index with updated titles, icons, and resources
all_links, _ = load_links_index(out_dir=OUTPUT_DIR) all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
write_links_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True) write_main_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
return all_links return all_links
@ -87,7 +138,7 @@ def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str
def list_archive_data(filter_patterns: Optional[List[str]]=None, filter_type: str='exact', def list_archive_data(filter_patterns: Optional[List[str]]=None, filter_type: str='exact',
after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]: after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
all_links, _ = load_links_index(out_dir=OUTPUT_DIR) all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
for link in all_links: for link in all_links:
if after is not None and float(link.timestamp) < after: if after is not None and float(link.timestamp) < after:
@ -133,7 +184,7 @@ def remove_archive_links(filter_patterns: List[str], filter_type: str='exact',
timer = TimedProgress(360, prefix=' ') timer = TimedProgress(360, prefix=' ')
try: try:
to_keep = [] to_keep = []
all_links, _ = load_links_index(out_dir=OUTPUT_DIR) all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
for link in all_links: for link in all_links:
should_remove = ( should_remove = (
(after is not None and float(link.timestamp) < after) (after is not None and float(link.timestamp) < after)
@ -147,7 +198,7 @@ def remove_archive_links(filter_patterns: List[str], filter_type: str='exact',
finally: finally:
timer.end() timer.end()
write_links_index(links=to_keep, out_dir=OUTPUT_DIR, finished=True) write_main_index(links=to_keep, out_dir=OUTPUT_DIR, finished=True)
log_removal_finished(len(all_links), len(to_keep)) log_removal_finished(len(all_links), len(to_keep))
return to_keep return to_keep

View file

@ -112,20 +112,25 @@ class Link:
return float(self.timestamp) > float(other.timestamp) return float(self.timestamp) > float(other.timestamp)
def typecheck(self) -> None: def typecheck(self) -> None:
assert self.schema == self.__class__.__name__ from .config import stderr, ANSI
assert isinstance(self.timestamp, str) and self.timestamp try:
assert self.timestamp.replace('.', '').isdigit() assert self.schema == self.__class__.__name__
assert isinstance(self.url, str) and '://' in self.url assert isinstance(self.timestamp, str) and self.timestamp
assert self.updated is None or isinstance(self.updated, datetime) assert self.timestamp.replace('.', '').isdigit()
assert self.title is None or isinstance(self.title, str) and self.title assert isinstance(self.url, str) and '://' in self.url
assert self.tags is None or isinstance(self.tags, str) and self.tags assert self.updated is None or isinstance(self.updated, datetime)
assert isinstance(self.sources, list) assert self.title is None or (isinstance(self.title, str) and self.title)
assert all(isinstance(source, str) and source for source in self.sources) assert self.tags is None or (isinstance(self.tags, str) and self.tags)
assert isinstance(self.history, dict) assert isinstance(self.sources, list)
for method, results in self.history.items(): assert all(isinstance(source, str) and source for source in self.sources)
assert isinstance(method, str) and method assert isinstance(self.history, dict)
assert isinstance(results, list) for method, results in self.history.items():
assert all(isinstance(result, ArchiveResult) for result in results) assert isinstance(method, str) and method
assert isinstance(results, list)
assert all(isinstance(result, ArchiveResult) for result in results)
except Exception:
stderr('{red}[X] Error while loading link! [{}] {} "{}"{reset}'.format(self.timestamp, self.url, self.title, **ANSI))
raise
def _asdict(self, extended=False): def _asdict(self, extended=False):
info = { info = {

View file

@ -0,0 +1 @@
# Explicitly sets the __package__ attribute of this package's __init__ module.
# NOTE(review): presumably so relative imports resolve consistently however the module is loaded - confirm.
__package__ = 'archivebox.legacy.storage'

View file

@ -0,0 +1,126 @@
import os
from datetime import datetime
from typing import List, Optional
from ..schema import Link
from ..config import (
OUTPUT_DIR,
TEMPLATES_DIR,
VERSION,
GIT_SHA,
FOOTER_INFO,
ARCHIVE_DIR_NAME,
)
from ..util import (
enforce_types,
ts_to_date,
urlencode,
htmlencode,
urldecode,
wget_output_path,
render_template,
atomic_write,
copy_and_overwrite,
)
# Short alias used throughout this module to build template and output paths.
# Bound directly to os.path.join instead of wrapping it in a lambda that only forwards its arguments.
join = os.path.join

# Template files under TEMPLATES_DIR that the render functions in this module pass to render_template.
MAIN_INDEX_TEMPLATE = join(TEMPLATES_DIR, 'main_index.html')
MAIN_INDEX_ROW_TEMPLATE = join(TEMPLATES_DIR, 'main_index_row.html')
LINK_DETAILS_TEMPLATE = join(TEMPLATES_DIR, 'link_details.html')

# Title shown for a link that has no title and is not archived yet.
TITLE_LOADING_MSG = 'Not yet archived...'
### Main Links Index
@enforce_types
def write_html_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
    """Copy the static assets into out_dir, then write the rendered main index.html there."""

    # copied in this order: favicon, robots.txt, then the static folder
    for asset_name in ('favicon.ico', 'robots.txt', 'static'):
        copy_and_overwrite(join(TEMPLATES_DIR, asset_name), join(out_dir, asset_name))

    index_page = main_index_template(links, finished=finished)
    atomic_write(index_page, join(out_dir, 'index.html'))
@enforce_types
def main_index_template(links: List[Link], finished: bool=True) -> str:
    """Render the template for the entire main index.

    Args:
        links: the links to list; one row is rendered per link, in the given order.
        finished: chooses between 'finished' and 'running' for the status variable.

    Returns:
        Whatever render_template produces for MAIN_INDEX_TEMPLATE with these variables.
    """
    # read the clock once so date_updated and time_updated always describe the
    # same instant (two separate now() calls can straddle midnight)
    now = datetime.now()

    return render_template(MAIN_INDEX_TEMPLATE, {
        'version': VERSION,
        'git_sha': GIT_SHA,
        'num_links': str(len(links)),
        'status': 'finished' if finished else 'running',
        'date_updated': now.strftime('%Y-%m-%d'),
        'time_updated': now.strftime('%Y-%m-%d %H:%M'),
        'rows': '\n'.join(
            main_index_row_template(link)
            for link in links
        ),
        'footer_info': FOOTER_INFO,
    })
@enforce_types
def main_index_row_template(link: Link) -> str:
    """Render the main-index row for a single link."""
    row_vars = {**link._asdict(extended=True)}

    # untitled links show their base_url once archived, the loading message before that
    title = link.title
    if not title:
        title = link.base_url if link.is_archived else TITLE_LOADING_MSG
    row_vars['title'] = title

    # always the favicon path inside the link's own archive folder
    row_vars['favicon_url'] = join(ARCHIVE_DIR_NAME, link.timestamp, 'favicon.ico')

    # 'index.html' is used when wget_output_path(link) returns nothing
    row_vars['wget_url'] = urlencode(wget_output_path(link) or 'index.html')

    # the stored tags, with the file extension appended for static links
    tags = link.tags or ''
    if link.is_static:
        tags = tags + ' {}'.format(link.extension)
    row_vars['tags'] = tags

    return render_template(MAIN_INDEX_ROW_TEMPLATE, row_vars)
### Link Details Index
@enforce_types
def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
    """Write the rendered details page for link to index.html in out_dir (link.link_dir when not given)."""
    target_dir = out_dir if out_dir else link.link_dir
    details_page = link_details_template(link)
    atomic_write(details_page, join(target_dir, 'index.html'))
@enforce_types
def link_details_template(link: Link) -> str:
    """Render the details page for a single link."""
    info = link._asdict(extended=True)

    # start from the link's own fields, with the 'canonical' entries flattened on top
    template_vars = {**info, **info['canonical']}

    # untitled links show their base_url once archived, the loading message before that
    title = link.title
    if not title:
        title = link.base_url if link.is_archived else TITLE_LOADING_MSG
    template_vars['title'] = title

    template_vars['url_str'] = htmlencode(urldecode(link.base_url))

    # without a wget output path: the domain for archived links, a blank page otherwise
    archive_target = wget_output_path(link)
    if not archive_target:
        archive_target = link.domain if link.is_archived else 'about:blank'
    template_vars['archive_url'] = urlencode(archive_target)

    template_vars['extension'] = link.extension or 'html'
    template_vars['tags'] = link.tags or 'untagged'
    template_vars['status'] = 'archived' if link.is_archived else 'not yet archived'
    template_vars['status_color'] = 'success' if link.is_archived else 'danger'
    template_vars['oldest_archive_date'] = ts_to_date(link.oldest_archive_date)

    return render_template(LINK_DETAILS_TEMPLATE, template_vars)

View file

@ -0,0 +1,81 @@
import os
import json
from datetime import datetime
from typing import List, Optional, Iterator
from ..schema import Link, ArchiveResult
from ..config import (
VERSION,
OUTPUT_DIR,
)
from ..util import (
enforce_types,
atomic_write,
)
### Main Links Index
@enforce_types
def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
    """Parse the main index.json in out_dir and yield each link it contains.

    Args:
        out_dir: folder expected to hold index.json.

    Yields:
        One Link (built with Link.from_json) per entry under the 'links' key.
        Nothing is yielded when out_dir has no index.json.
    """
    index_path = os.path.join(out_dir, 'index.json')
    if not os.path.exists(index_path):
        return

    # load everything and close the file before yielding, so the handle is not
    # kept open while the caller iterates over (or abandons) this generator
    with open(index_path, 'r', encoding='utf-8') as f:
        links = json.load(f)['links']

    for link_json in links:
        yield Link.from_json(link_json)
@enforce_types
def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
    """Sanity-check links and write them, wrapped in some metadata, to index.json in out_dir."""
    assert isinstance(links, List), 'Links must be a list, not a generator.'

    # only the first link is spot-checked
    if links:
        first = links[0]
        assert isinstance(first.history, dict)
        assert isinstance(first.sources, list)

        if first.history.get('title'):
            assert isinstance(first.history['title'][0], ArchiveResult)

        if first.sources:
            assert isinstance(first.sources[0], str)

    index_path = os.path.join(out_dir, 'index.json')
    payload = {
        'info': 'ArchiveBox Index',
        'source': 'https://github.com/pirate/ArchiveBox',
        'docs': 'https://github.com/pirate/ArchiveBox/wiki',
        'version': VERSION,
        'num_links': len(links),
        'updated': datetime.now(),
        'links': links,
    }
    atomic_write(payload, index_path)
### Link Details Index
@enforce_types
def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
    """Write the link's extended dict form to index.json in out_dir (link.link_dir when not given)."""
    if not out_dir:
        out_dir = link.link_dir

    index_path = os.path.join(out_dir, 'index.json')
    atomic_write(link._asdict(extended=True), index_path)
@enforce_types
def parse_json_link_details(out_dir: str) -> Optional[Link]:
    """Return the Link stored in out_dir's index.json, or None when that file does not exist."""
    index_path = os.path.join(out_dir, 'index.json')
    if not os.path.exists(index_path):
        return None

    with open(index_path, 'r', encoding='utf-8') as index_file:
        return Link.from_json(json.load(index_file))

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

View file

@ -246,7 +246,7 @@
</a> </a>
</div> </div>
<div class="col-lg-8"> <div class="col-lg-8">
<img src="$link_dir/$favicon_url" alt="Favicon"> <img src="$link_dir/favicon.ico" alt="Favicon">
&nbsp;&nbsp; &nbsp;&nbsp;
$title $title
&nbsp;&nbsp; &nbsp;&nbsp;
@ -325,36 +325,36 @@
</div> </div>
<div class="col-lg-2"> <div class="col-lg-2">
<div class="card"> <div class="card">
<iframe class="card-img-top" src="$dom_url" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe> <iframe class="card-img-top" src="$dom_path" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body"> <div class="card-body">
<a href="$dom_url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener"> <a href="$dom_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/> <img src="../../static/external.png" class="external"/>
</a> </a>
<a href="$dom_url" target="preview"><h4 class="card-title">HTML</h4></a> <a href="$dom_path" target="preview"><h4 class="card-title">HTML</h4></a>
<p class="card-text">archive/output.html</p> <p class="card-text">archive/output.html</p>
</div> </div>
</div> </div>
</div> </div>
<div class="col-lg-2"> <div class="col-lg-2">
<div class="card"> <div class="card">
<iframe class="card-img-top pdf-frame" src="$pdf_url" scrolling="no"></iframe> <iframe class="card-img-top pdf-frame" src="$pdf_path" scrolling="no"></iframe>
<div class="card-body"> <div class="card-body">
<a href="$pdf_url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener"> <a href="$pdf_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/> <img src="../../static/external.png" class="external"/>
</a> </a>
<a href="$pdf_url" target="preview" id="pdf-btn"><h4 class="card-title">PDF</h4></a> <a href="$pdf_path" target="preview" id="pdf-btn"><h4 class="card-title">PDF</h4></a>
<p class="card-text">archive/output.pdf</p> <p class="card-text">archive/output.pdf</p>
</div> </div>
</div> </div>
</div> </div>
<div class="col-lg-2"> <div class="col-lg-2">
<div class="card"> <div class="card">
<img class="card-img-top screenshot" src="$screenshot_url"></iframe> <img class="card-img-top screenshot" src="$screenshot_path"></iframe>
<div class="card-body"> <div class="card-body">
<a href="$screenshot_url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener"> <a href="$screenshot_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/> <img src="../../static/external.png" class="external"/>
</a> </a>
<a href="$screenshot_url" target="preview"><h4 class="card-title">Screenshot</h4></a> <a href="$screenshot_path" target="preview"><h4 class="card-title">Screenshot</h4></a>
<p class="card-text">archive/screenshot.png</p> <p class="card-text">archive/screenshot.png</p>
</div> </div>
</div> </div>
@ -373,12 +373,12 @@
</div> </div>
<div class="col-lg-2"> <div class="col-lg-2">
<div class="card"> <div class="card">
<iframe class="card-img-top" src="$archive_org_url" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe> <iframe class="card-img-top" src="$archive_org_path" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body"> <div class="card-body">
<a href="$archive_org_url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener"> <a href="$archive_org_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/> <img src="../../static/external.png" class="external"/>
</a> </a>
<a href="$archive_org_url" target="preview"><h4 class="card-title">Archive.Org</h4></a> <a href="$archive_org_path" target="preview"><h4 class="card-title">Archive.Org</h4></a>
<p class="card-text">web.archive.org/web/...</p> <p class="card-text">web.archive.org/web/...</p>
</div> </div>
</div> </div>

View file

@ -1,14 +1,14 @@
<tr> <tr>
<td title="$timestamp">$bookmarked_date</td> <td title="$timestamp">$bookmarked_date</td>
<td class="title-col"> <td class="title-col">
<a href="$archive_path/$index_url"><img src="$favicon_url" class="link-favicon" decoding="async"></a> <a href="$archive_path/index.html"><img src="$favicon_url" class="link-favicon" decoding="async"></a>
<a href="$archive_path/$wget_url" title="$title"> <a href="$archive_path/$wget_url" title="$title">
<span data-title-for="$url" data-archived="$is_archived">$title</span> <span data-title-for="$url" data-archived="$is_archived">$title</span>
<small style="float:right">$tags</small> <small style="float:right">$tags</small>
</a> </a>
</td> </td>
<td> <td>
<a href="$archive_path/$index_url">📄 <a href="$archive_path/index.html">📄
<span data-number-for="$url" title="Fetching any missing files...">$num_outputs <img src="static/spinner.gif" class="files-spinner" decoding="async"/></span> <span data-number-for="$url" title="Fetching any missing files...">$num_outputs <img src="static/spinner.gif" class="files-spinner" decoding="async"/></span>
</a> </a>
</td> </td>

View file

@ -0,0 +1,2 @@
User-agent: *
Disallow: /

View file

@ -5,8 +5,9 @@ import json
import time import time
import shutil import shutil
from string import Template
from json import JSONEncoder from json import JSONEncoder
from typing import List, Optional, Any, Union, IO from typing import List, Optional, Any, Union, IO, Mapping
from inspect import signature from inspect import signature
from functools import wraps from functools import wraps
from hashlib import sha256 from hashlib import sha256
@ -396,10 +397,11 @@ def parse_date(date: Any) -> Optional[datetime]:
try: try:
return datetime.fromisoformat(date) return datetime.fromisoformat(date)
except Exception: except Exception:
try: pass
return datetime.strptime(date, '%Y-%m-%d %H:%M') try:
except Exception: return datetime.strptime(date, '%Y-%m-%d %H:%M')
pass except Exception:
pass
raise ValueError('Tried to parse invalid date! {}'.format(date)) raise ValueError('Tried to parse invalid date! {}'.format(date))
@ -552,9 +554,12 @@ def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, tim
@enforce_types @enforce_types
def copy_and_overwrite(from_path: str, to_path: str): def copy_and_overwrite(from_path: str, to_path: str):
if os.path.exists(to_path): if os.path.isdir(from_path):
shutil.rmtree(to_path) shutil.rmtree(to_path, ignore_errors=True)
shutil.copytree(from_path, to_path) shutil.copytree(from_path, to_path)
else:
with open(from_path, 'rb') as src:
atomic_write(src.read(), to_path)
@enforce_types @enforce_types
def chrome_args(**options) -> List[str]: def chrome_args(**options) -> List[str]:
@ -642,11 +647,27 @@ def to_csv(links: List[Link], csv_cols: Optional[List[str]]=None,
return '\n'.join((header_str, *row_strs)) return '\n'.join((header_str, *row_strs))
def atomic_write(contents: Union[dict, str], path: str) -> None: @enforce_types
def render_template(template_path: str, context: Mapping[str, str]) -> str:
    """Read the template file at ``template_path`` and fill in its placeholders.

    Uses ``string.Template`` ``$name`` substitution with the entries of
    ``context`` passed as keyword arguments; a placeholder with no matching
    entry raises ``KeyError``.
    """
    # will be replaced by django templates in the future
    with open(template_path, 'r', encoding='utf-8') as template_file:
        raw_template = template_file.read()

    rendered = Template(raw_template).substitute(**context)
    return rendered
def atomic_write(contents: Union[dict, str, bytes], path: str) -> None:
"""Safe atomic write to filesystem by writing to temp file + atomic rename""" """Safe atomic write to filesystem by writing to temp file + atomic rename"""
try: try:
tmp_file = '{}.tmp'.format(path) tmp_file = '{}.tmp'.format(path)
with open(tmp_file, 'w+', encoding='utf-8') as f:
if isinstance(contents, bytes):
args = {'mode': 'wb+'}
else:
args = {'mode': 'w+', 'encoding': 'utf-8'}
with open(tmp_file, **args) as f:
if isinstance(contents, dict): if isinstance(contents, dict):
to_json(contents, file=f) to_json(contents, file=f)
else: else:
@ -678,3 +699,5 @@ def reject_stdin(caller: str) -> None:
)) ))
print() print()
raise SystemExit(1) raise SystemExit(1)