better loading and saving storage mechanism

This commit is contained in:
Nick Sweeting 2019-04-17 02:25:28 -04:00
parent c95f893b61
commit 9ce47431da
14 changed files with 395 additions and 238 deletions

View file

@ -6,9 +6,9 @@ from datetime import datetime
from .schema import Link, ArchiveResult, ArchiveOutput
from .index import (
write_link_index,
patch_links_index,
load_json_link_index,
load_link_details,
write_link_details,
patch_main_index,
)
from .config import (
CURL_BINARY,

View file

@ -115,7 +115,6 @@ URL_BLACKLIST_PTN = re.compile(URL_BLACKLIST, re.IGNORECASE) if URL_BLACKLIST el
VERSION = open(os.path.join(REPO_DIR, 'VERSION'), 'r').read().strip()
GIT_SHA = VERSION.split('+')[-1] or 'unknown'
HAS_INVALID_DEPENDENCIES = False
HAS_INVALID_DB = not os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))
### Check system environment
if USER == 'root':
@ -429,13 +428,12 @@ def check_dependencies() -> None:
raise SystemExit(1)
def check_data_folder() -> None:
if HAS_INVALID_DB:
stderr('{red}[X] No archive data found in:{reset} {}'.format(OUTPUT_DIR, **ANSI))
if not os.path.exists(os.path.join(OUTPUT_DIR, 'index.json')):
stderr('{red}[X] No archive data was found in:{reset} {}'.format(OUTPUT_DIR, **ANSI))
stderr(' Are you running archivebox in the right folder?')
stderr(' cd path/to/your/archive')
stderr(' cd path/to/your/archive/folder')
stderr(' archivebox [command]')
stderr()
stderr(' To create a new archive folder, run:')
stderr(' mkdir new_archive_dir && cd new_archive_dir')
stderr(' To create a new archive collection in this folder, run:')
stderr(' archivebox init')
raise SystemExit(1)

View file

@ -1,33 +1,28 @@
import os
import json
from datetime import datetime
from string import Template
from typing import List, Tuple, Iterator, Optional, Mapping, Iterable
from typing import List, Tuple, Optional, Iterable
from collections import OrderedDict
from .schema import Link, ArchiveResult
from .config import (
OUTPUT_DIR,
TEMPLATES_DIR,
VERSION,
GIT_SHA,
FOOTER_INFO,
TIMEOUT,
URL_BLACKLIST_PTN,
ANSI,
stderr,
)
from .storage.html import write_html_main_index, write_html_link_details
from .storage.json import (
parse_json_main_index,
write_json_main_index,
parse_json_link_details,
write_json_link_details,
)
from .util import (
scheme,
ts_to_date,
urlencode,
htmlencode,
urldecode,
wget_output_path,
enforce_types,
TimedProgress,
copy_and_overwrite,
atomic_write,
ExtendedEncoder,
)
@ -40,8 +35,6 @@ from .logs import (
log_parsing_finished,
)
TITLE_LOADING_MSG = 'Not yet archived...'
### Link filtering and checking
@ -53,8 +46,10 @@ def merge_links(a: Link, b: Link) -> Link:
"""
assert a.base_url == b.base_url, 'Cannot merge two links with different URLs'
# longest url wins (because a fuzzy url will always be shorter)
url = a.url if len(a.url) > len(b.url) else b.url
# best title based on length and quality
possible_titles = [
title
for title in (a.title, b.title)
@ -66,20 +61,24 @@ def merge_links(a: Link, b: Link) -> Link:
elif len(possible_titles) == 1:
title = possible_titles[0]
# earliest valid timestamp
timestamp = (
a.timestamp
if float(a.timestamp or 0) < float(b.timestamp or 0) else
b.timestamp
)
# all unique, truthy tags
tags_set = (
set(tag.strip() for tag in (a.tags or '').split(','))
| set(tag.strip() for tag in (b.tags or '').split(','))
)
tags = ','.join(tags_set) or None
# all unique source entries
sources = list(set(a.sources + b.sources))
# all unique history entries for the combined archive methods
all_methods = set(list(a.history.keys()) + list(a.history.keys()))
history = {
method: (a.history.get(method) or []) + (b.history.get(method) or [])
@ -95,7 +94,6 @@ def merge_links(a: Link, b: Link) -> Link:
key=lambda result: result.start_ts,
)))
return Link(
url=url,
timestamp=timestamp,
@ -105,6 +103,8 @@ def merge_links(a: Link, b: Link) -> Link:
history=history,
)
@enforce_types
def validate_links(links: Iterable[Link]) -> Iterable[Link]:
links = archivable_links(links) # remove chrome://, about:, mailto: etc.
links = sorted_links(links) # deterministically sort the links based on timstamp, url
@ -121,6 +121,8 @@ def validate_links(links: Iterable[Link]) -> Iterable[Link]:
return links
@enforce_types
def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
"""remove chrome://, about:// or other schemed links that cant be archived"""
for link in links:
@ -130,6 +132,7 @@ def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
yield link
@enforce_types
def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
"""
ensures that all non-duplicate links have monotonically increasing timestamps
@ -153,12 +156,14 @@ def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
return unique_timestamps.values()
@enforce_types
def sorted_links(links: Iterable[Link]) -> Iterable[Link]:
sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url)
return sorted(links, key=sort_func, reverse=True)
def links_after_timestamp(links: Iterable[Link], resume: float=None) -> Iterable[Link]:
@enforce_types
def links_after_timestamp(links: Iterable[Link], resume: Optional[float]=None) -> Iterable[Link]:
if not resume:
yield from links
return
@ -171,6 +176,7 @@ def links_after_timestamp(links: Iterable[Link], resume: float=None) -> Iterable
print('Resume value and all timestamp values must be valid numbers.')
@enforce_types
def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
"""resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""
@ -190,10 +196,10 @@ def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
### Homepage index for all the links
### Main Links Index
@enforce_types
def write_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
"""create index.html file for a given list of links"""
log_indexing_process_started()
@ -201,7 +207,7 @@ def write_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool
log_indexing_started(out_dir, 'index.json')
timer = TimedProgress(TIMEOUT * 2, prefix=' ')
try:
write_json_links_index(links, out_dir=out_dir)
write_json_main_index(links, out_dir=out_dir)
finally:
timer.end()
log_indexing_finished(out_dir, 'index.json')
@ -209,19 +215,19 @@ def write_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool
log_indexing_started(out_dir, 'index.html')
timer = TimedProgress(TIMEOUT * 2, prefix=' ')
try:
write_html_links_index(links, out_dir=out_dir, finished=finished)
write_html_main_index(links, out_dir=out_dir, finished=finished)
finally:
timer.end()
log_indexing_finished(out_dir, 'index.html')
@enforce_types
def load_links_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -> Tuple[List[Link], List[Link]]:
def load_main_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -> Tuple[List[Link], List[Link]]:
"""parse and load existing index with any new links from import_path merged in"""
existing_links: List[Link] = []
if out_dir:
existing_links = list(parse_json_links_index(out_dir))
existing_links = list(parse_json_main_index(out_dir))
new_links: List[Link] = []
if import_path:
@ -242,108 +248,16 @@ def load_links_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -
@enforce_types
def write_json_links_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
"""write the json link index to a given path"""
def patch_main_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
"""hack to in-place update one row's info in the generated index files"""
assert isinstance(links, List), 'Links must be a list, not a generator.'
assert not links or isinstance(links[0].history, dict)
assert not links or isinstance(links[0].sources, list)
# TODO: remove this ASAP, it's ugly, error-prone, and potentially dangerous
if links and links[0].history.get('title'):
assert isinstance(links[0].history['title'][0], ArchiveResult)
if links and links[0].sources:
assert isinstance(links[0].sources[0], str)
path = os.path.join(out_dir, 'index.json')
index_json = {
'info': 'ArchiveBox Index',
'source': 'https://github.com/pirate/ArchiveBox',
'docs': 'https://github.com/pirate/ArchiveBox/wiki',
'version': VERSION,
'num_links': len(links),
'updated': datetime.now(),
'links': links,
}
atomic_write(index_json, path)
@enforce_types
def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
"""parse a archive index json file and return the list of links"""
index_path = os.path.join(out_dir, 'index.json')
if os.path.exists(index_path):
with open(index_path, 'r', encoding='utf-8') as f:
links = json.load(f)['links']
for link_json in links:
yield Link.from_json(link_json)
return ()
@enforce_types
def write_html_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
"""write the html link index to a given path"""
copy_and_overwrite(
os.path.join(TEMPLATES_DIR, 'static'),
os.path.join(out_dir, 'static'),
)
atomic_write('User-agent: *\nDisallow: /', os.path.join(out_dir, 'robots.txt'))
with open(os.path.join(TEMPLATES_DIR, 'index.html'), 'r', encoding='utf-8') as f:
index_html = f.read()
with open(os.path.join(TEMPLATES_DIR, 'index_row.html'), 'r', encoding='utf-8') as f:
link_row_html = f.read()
link_rows = []
for link in links:
template_row_vars: Mapping[str, str] = {
**derived_link_info(link),
'title': (
link.title
or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
),
'tags': (link.tags or '') + (' {}'.format(link.extension) if link.is_static else ''),
'favicon_url': (
os.path.join('archive', link.timestamp, 'favicon.ico')
# if link['is_archived'] else ''
),
'archive_url': urlencode(
wget_output_path(link) or 'index.html'
),
}
link_rows.append(Template(link_row_html).substitute(**template_row_vars))
template_vars: Mapping[str, str] = {
'num_links': str(len(links)),
'date_updated': datetime.now().strftime('%Y-%m-%d'),
'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
'footer_info': FOOTER_INFO,
'version': VERSION,
'git_sha': GIT_SHA,
'rows': '\n'.join(link_rows),
'status': 'finished' if finished else 'running',
}
template_html = Template(index_html).substitute(**template_vars)
atomic_write(template_html, os.path.join(out_dir, 'index.html'))
@enforce_types
def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
"""hack to in-place update one row's info in the generated index html"""
title = link.title or link.latest_outputs()['title']
title = link.title or link.latest_outputs(status='succeeded')['title']
successful = link.num_outputs
# Patch JSON index
json_file_links = parse_json_links_index(out_dir)
# Patch JSON main index
json_file_links = parse_json_main_index(out_dir)
patched_links = []
for saved_link in json_file_links:
if saved_link.url == link.url:
@ -355,11 +269,12 @@ def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
else:
patched_links.append(saved_link)
write_json_links_index(patched_links, out_dir=out_dir)
write_json_main_index(patched_links, out_dir=out_dir)
# Patch HTML index
# Patch HTML main index
html_path = os.path.join(out_dir, 'index.html')
html = open(html_path, 'r').read().split('\n')
with open(html_path, 'r') as f:
html = f.read().split('\n')
for idx, line in enumerate(html):
if title and ('<span data-title-for="{}"'.format(link.url) in line):
html[idx] = '<span>{}</span>'.format(title)
@ -370,76 +285,31 @@ def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
atomic_write('\n'.join(html), html_path)
### Individual link index
### Link Details Index
@enforce_types
def write_link_index(link: Link, link_dir: Optional[str]=None) -> None:
link_dir = link_dir or link.link_dir
def write_link_details(link: Link, out_dir: Optional[str]=None) -> None:
out_dir = out_dir or link.link_dir
write_json_link_index(link, link_dir)
write_html_link_index(link, link_dir)
write_json_link_details(link, out_dir=out_dir)
write_html_link_details(link, out_dir=out_dir)
@enforce_types
def write_json_link_index(link: Link, link_dir: Optional[str]=None) -> None:
"""write a json file with some info about the link"""
link_dir = link_dir or link.link_dir
path = os.path.join(link_dir, 'index.json')
atomic_write(link._asdict(), path)
@enforce_types
def parse_json_link_index(link_dir: str) -> Optional[Link]:
"""load the json link index from a given directory"""
existing_index = os.path.join(link_dir, 'index.json')
if os.path.exists(existing_index):
with open(existing_index, 'r', encoding='utf-8') as f:
link_json = json.load(f)
return Link.from_json(link_json)
return None
@enforce_types
def load_json_link_index(link: Link, link_dir: Optional[str]=None) -> Link:
def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
"""check for an existing link archive in the given directory,
and load+merge it into the given link dict
"""
link_dir = link_dir or link.link_dir
existing_link = parse_json_link_index(link_dir)
out_dir = out_dir or link.link_dir
existing_link = parse_json_link_details(out_dir)
if existing_link:
return merge_links(existing_link, link)
return link
@enforce_types
def write_html_link_index(link: Link, link_dir: Optional[str]=None) -> None:
link_dir = link_dir or link.link_dir
with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
link_html = f.read()
path = os.path.join(link_dir, 'index.html')
template_vars: Mapping[str, str] = {
**derived_link_info(link),
'title': (
link.title
or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
),
'url_str': htmlencode(urldecode(link.base_url)),
'archive_url': urlencode(
wget_output_path(link)
or (link.domain if link.is_archived else 'about:blank')
),
'extension': link.extension or 'html',
'tags': link.tags or 'untagged',
'status': 'archived' if link.is_archived else 'not yet archived',
'status_color': 'success' if link.is_archived else 'danger',
'oldest_archive_date': ts_to_date(link.oldest_archive_date),
}
html_index = Template(link_html).substitute(**template_vars)
atomic_write(html_index, path)

View file

@ -1,3 +1,4 @@
import os
import re
import shutil
@ -7,13 +8,18 @@ from .schema import Link
from .util import enforce_types, TimedProgress
from .index import (
links_after_timestamp,
load_links_index,
write_links_index,
load_main_index,
write_main_index,
)
from .archive_methods import archive_link
from .config import (
stderr,
ANSI,
ONLY_NEW,
OUTPUT_DIR,
SOURCES_DIR,
ARCHIVE_DIR,
DATABASE_DIR,
check_dependencies,
check_data_folder,
)
@ -28,6 +34,51 @@ from .logs import (
)
@enforce_types
def init():
    """Set up a brand new archive collection inside OUTPUT_DIR.

    Exits with status 1 when OUTPUT_DIR already holds anything besides a few
    known harmless entries (with a friendlier message if an index.json is
    among them). Otherwise creates the sources, archive and database folders
    and writes an empty, finished main index.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # entries that are tolerated in an otherwise "empty" folder
    ignorable = {'.DS_Store', '.venv', 'venv', 'virtualenv', '.virtualenv'}
    leftover_entries = set(os.listdir(OUTPUT_DIR)) - ignorable
    has_index = os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))

    if leftover_entries and has_index:
        stderr('{green}[√] You already have an archive index in: {}{reset}'.format(OUTPUT_DIR, **ANSI))
        stderr(' To add new links, you can run:')
        stderr(" archivebox add 'https://example.com'")
        stderr()
        stderr(' For more usage and examples, run:')
        stderr(' archivebox help')
        # TODO: import old archivebox version's archive data folder
        raise SystemExit(1)

    if leftover_entries:
        not_empty_msg = (
            "{red}[X] This folder already has files in it. You must run init inside a completely empty directory.{reset}"
            "\n\n"
            " {lightred}Hint:{reset} To import a data folder created by an older version of ArchiveBox, \n"
            " just cd into the folder and run the archivebox command to pick up where you left off.\n\n"
            " (Always make sure your data folder is backed up first before updating ArchiveBox)"
        )
        stderr(not_empty_msg.format(OUTPUT_DIR, **ANSI))
        raise SystemExit(1)

    stderr('{green}[+] Initializing new archive directory: {}{reset}'.format(OUTPUT_DIR, **ANSI))

    # create each data folder and report it as it is made
    for new_dir in (SOURCES_DIR, ARCHIVE_DIR, DATABASE_DIR):
        os.makedirs(new_dir)
        stderr(f' > {new_dir}')

    write_main_index([], out_dir=OUTPUT_DIR, finished=True)

    stderr('{green}[√] Done.{reset}'.format(**ANSI))
@enforce_types
def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]=None, only_new: bool=False) -> List[Link]:
"""The main ArchiveBox entrancepoint. Everything starts here."""
@ -37,19 +88,19 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]
# Step 1: Load list of links from the existing index
# merge in and dedupe new links from import_path
all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)
all_links, new_links = load_main_index(out_dir=OUTPUT_DIR, import_path=import_path)
# Step 2: Write updated index with deduped old and new links back to disk
write_links_index(links=list(all_links), out_dir=OUTPUT_DIR)
write_main_index(links=list(all_links), out_dir=OUTPUT_DIR)
# Step 3: Run the archive methods for each link
links = new_links if ONLY_NEW else all_links
log_archiving_started(len(links), resume)
idx: int = 0
link: Optional[Link] = None
link: Link = None # type: ignore
try:
for idx, link in enumerate(links_after_timestamp(links, resume)):
archive_link(link, link_dir=link.link_dir)
archive_link(link, out_dir=link.link_dir)
except KeyboardInterrupt:
log_archiving_paused(len(links), idx, link.timestamp if link else '0')
@ -62,8 +113,8 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]
log_archiving_finished(len(links))
# Step 4: Re-write links index with updated titles, icons, and resources
all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
write_links_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
write_main_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
return all_links
@ -87,7 +138,7 @@ def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str
def list_archive_data(filter_patterns: Optional[List[str]]=None, filter_type: str='exact',
after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
for link in all_links:
if after is not None and float(link.timestamp) < after:
@ -133,7 +184,7 @@ def remove_archive_links(filter_patterns: List[str], filter_type: str='exact',
timer = TimedProgress(360, prefix=' ')
try:
to_keep = []
all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
for link in all_links:
should_remove = (
(after is not None and float(link.timestamp) < after)
@ -147,7 +198,7 @@ def remove_archive_links(filter_patterns: List[str], filter_type: str='exact',
finally:
timer.end()
write_links_index(links=to_keep, out_dir=OUTPUT_DIR, finished=True)
write_main_index(links=to_keep, out_dir=OUTPUT_DIR, finished=True)
log_removal_finished(len(all_links), len(to_keep))
return to_keep

View file

@ -112,13 +112,15 @@ class Link:
return float(self.timestamp) > float(other.timestamp)
def typecheck(self) -> None:
from .config import stderr, ANSI
try:
assert self.schema == self.__class__.__name__
assert isinstance(self.timestamp, str) and self.timestamp
assert self.timestamp.replace('.', '').isdigit()
assert isinstance(self.url, str) and '://' in self.url
assert self.updated is None or isinstance(self.updated, datetime)
assert self.title is None or isinstance(self.title, str) and self.title
assert self.tags is None or isinstance(self.tags, str) and self.tags
assert self.title is None or (isinstance(self.title, str) and self.title)
assert self.tags is None or (isinstance(self.tags, str) and self.tags)
assert isinstance(self.sources, list)
assert all(isinstance(source, str) and source for source in self.sources)
assert isinstance(self.history, dict)
@ -126,6 +128,9 @@ class Link:
assert isinstance(method, str) and method
assert isinstance(results, list)
assert all(isinstance(result, ArchiveResult) for result in results)
except Exception:
stderr('{red}[X] Error while loading link! [{}] {} "{}"{reset}'.format(self.timestamp, self.url, self.title, **ANSI))
raise
def _asdict(self, extended=False):
info = {

View file

@ -0,0 +1 @@
__package__ = 'archivebox.legacy.storage'

View file

@ -0,0 +1,126 @@
import os
from datetime import datetime
from typing import List, Optional
from ..schema import Link
from ..config import (
OUTPUT_DIR,
TEMPLATES_DIR,
VERSION,
GIT_SHA,
FOOTER_INFO,
ARCHIVE_DIR_NAME,
)
from ..util import (
enforce_types,
ts_to_date,
urlencode,
htmlencode,
urldecode,
wget_output_path,
render_template,
atomic_write,
copy_and_overwrite,
)
def join(*paths: str) -> str:
    """Join path components with os.path.join (short module-local alias)."""
    return os.path.join(*paths)
# Paths of the template files (inside TEMPLATES_DIR) rendered by this module
MAIN_INDEX_TEMPLATE = join(TEMPLATES_DIR, 'main_index.html')
MAIN_INDEX_ROW_TEMPLATE = join(TEMPLATES_DIR, 'main_index_row.html')
LINK_DETAILS_TEMPLATE = join(TEMPLATES_DIR, 'link_details.html')
# Placeholder title used for links that have no title and are not archived yet
TITLE_LOADING_MSG = 'Not yet archived...'
### Main Links Index
@enforce_types
def write_html_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
    """Refresh the bundled assets in out_dir and write the rendered index.html there."""
    # copy the favicon, robots.txt and static folder over from the templates dir
    for asset_name in ('favicon.ico', 'robots.txt', 'static'):
        copy_and_overwrite(join(TEMPLATES_DIR, asset_name), join(out_dir, asset_name))

    index_html = main_index_template(links, finished=finished)
    index_path = join(out_dir, 'index.html')
    atomic_write(index_html, index_path)
@enforce_types
def main_index_template(links: List[Link], finished: bool=True) -> str:
    """Render the template for the entire main index.

    Args:
        links: the links to list, one rendered row per link, in order.
        finished: selects the 'status' template variable:
            'finished' when true, 'running' otherwise.

    Returns:
        The rendered HTML of the main index page.
    """
    # sample the clock once so 'date_updated' and 'time_updated' can never
    # disagree when rendering straddles a minute or midnight boundary
    now = datetime.now()

    rows = '\n'.join(main_index_row_template(link) for link in links)

    return render_template(MAIN_INDEX_TEMPLATE, {
        'version': VERSION,
        'git_sha': GIT_SHA,
        'num_links': str(len(links)),
        'status': 'finished' if finished else 'running',
        'date_updated': now.strftime('%Y-%m-%d'),
        'time_updated': now.strftime('%Y-%m-%d %H:%M'),
        'rows': rows,
        'footer_info': FOOTER_INFO,
    })
@enforce_types
def main_index_row_template(link: Link) -> str:
    """Render the main index table row for a single link."""
    row_context = {**link._asdict(extended=True)}

    # untitled links show their base_url once archived, the loading message before that
    if link.title:
        row_context['title'] = link.title
    elif link.is_archived:
        row_context['title'] = link.base_url
    else:
        row_context['title'] = TITLE_LOADING_MSG

    # favicon path inside the link's archive folder (emitted unconditionally)
    row_context['favicon_url'] = join(ARCHIVE_DIR_NAME, link.timestamp, 'favicon.ico')

    # point at the details page ('index.html') when there is no wget output path
    wget_target = wget_output_path(link) or 'index.html'
    row_context['wget_url'] = urlencode(wget_target)

    # the link's tags, followed by the file extension when the link is static
    tags = link.tags or ''
    if link.is_static:
        tags += ' {}'.format(link.extension)
    row_context['tags'] = tags

    return render_template(MAIN_INDEX_ROW_TEMPLATE, row_context)
### Link Details Index
@enforce_types
def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
    """Render the details page for link and write it as index.html.

    The page goes into out_dir, or into link.link_dir when out_dir is not given.
    """
    if not out_dir:
        out_dir = link.link_dir

    details_html = link_details_template(link)
    atomic_write(details_html, join(out_dir, 'index.html'))
@enforce_types
def link_details_template(link: Link) -> str:
    """Render the details page template for a single link."""
    link_info = link._asdict(extended=True)

    # start from the link's extended fields, then flatten its 'canonical'
    # sub-dict into the same namespace
    # NOTE(review): presumably 'canonical' holds the per-output paths the template uses; confirm in Link._asdict
    context = dict(link_info)
    context.update(link_info['canonical'])

    # untitled links show their base_url once archived, the loading message before that
    if link.title:
        page_title = link.title
    elif link.is_archived:
        page_title = link.base_url
    else:
        page_title = TITLE_LOADING_MSG

    # fall back to the domain (archived) or a blank page when there is no wget output path
    archive_target = wget_output_path(link)
    if not archive_target:
        archive_target = link.domain if link.is_archived else 'about:blank'

    context.update({
        'title': page_title,
        'url_str': htmlencode(urldecode(link.base_url)),
        'archive_url': urlencode(archive_target),
        'extension': link.extension or 'html',
        'tags': link.tags or 'untagged',
        'status': 'archived' if link.is_archived else 'not yet archived',
        'status_color': 'success' if link.is_archived else 'danger',
        'oldest_archive_date': ts_to_date(link.oldest_archive_date),
    })
    return render_template(LINK_DETAILS_TEMPLATE, context)

View file

@ -0,0 +1,81 @@
import os
import json
from datetime import datetime
from typing import List, Optional, Iterator
from ..schema import Link, ArchiveResult
from ..config import (
VERSION,
OUTPUT_DIR,
)
from ..util import (
enforce_types,
atomic_write,
)
### Main Links Index
@enforce_types
def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
    """Yield a Link for every entry under 'links' in out_dir's index.json.

    Args:
        out_dir: folder expected to contain the main index.json.

    Yields:
        One Link (built with Link.from_json) per entry, in file order.
        Nothing is yielded when the index file does not exist.
    """
    index_path = os.path.join(out_dir, 'index.json')
    if not os.path.exists(index_path):
        # a bare return simply ends the generator; there is no value to hand back
        return

    # read the whole index up front so the file is closed before the first
    # Link is handed to the caller
    with open(index_path, 'r', encoding='utf-8') as f:
        links = json.load(f)['links']

    for link_json in links:
        yield Link.from_json(link_json)
@enforce_types
def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
    """Write the main JSON index for links to index.json inside out_dir."""
    assert isinstance(links, List), 'Links must be a list, not a generator.'

    # sanity-check the shape of the first link only
    if links:
        first_link = links[0]
        assert isinstance(first_link.history, dict)
        assert isinstance(first_link.sources, list)

        if first_link.history.get('title'):
            assert isinstance(first_link.history['title'][0], ArchiveResult)

        if first_link.sources:
            assert isinstance(first_link.sources[0], str)

    index_path = os.path.join(out_dir, 'index.json')

    # header metadata followed by the links themselves
    index_json = {
        'info': 'ArchiveBox Index',
        'source': 'https://github.com/pirate/ArchiveBox',
        'docs': 'https://github.com/pirate/ArchiveBox/wiki',
        'version': VERSION,
        'num_links': len(links),
        'updated': datetime.now(),
        'links': links,
    }
    atomic_write(index_json, index_path)
### Link Details Index
@enforce_types
def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
    """Write the link's extended dict to index.json.

    The file goes into out_dir, or into link.link_dir when out_dir is not given.
    """
    if not out_dir:
        out_dir = link.link_dir

    details_path = os.path.join(out_dir, 'index.json')
    link_info = link._asdict(extended=True)
    atomic_write(link_info, details_path)
@enforce_types
def parse_json_link_details(out_dir: str) -> Optional[Link]:
    """Load the Link stored in out_dir's index.json, or None if that file is missing."""
    details_path = os.path.join(out_dir, 'index.json')

    # nothing saved for this link yet
    if not os.path.exists(details_path):
        return None

    with open(details_path, 'r', encoding='utf-8') as details_file:
        return Link.from_json(json.load(details_file))

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

View file

@ -246,7 +246,7 @@
</a>
</div>
<div class="col-lg-8">
<img src="$link_dir/$favicon_url" alt="Favicon">
<img src="$link_dir/favicon.ico" alt="Favicon">
&nbsp;&nbsp;
$title
&nbsp;&nbsp;
@ -325,36 +325,36 @@
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="$dom_url" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<iframe class="card-img-top" src="$dom_path" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body">
<a href="$dom_url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<a href="$dom_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/>
</a>
<a href="$dom_url" target="preview"><h4 class="card-title">HTML</h4></a>
<a href="$dom_path" target="preview"><h4 class="card-title">HTML</h4></a>
<p class="card-text">archive/output.html</p>
</div>
</div>
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top pdf-frame" src="$pdf_url" scrolling="no"></iframe>
<iframe class="card-img-top pdf-frame" src="$pdf_path" scrolling="no"></iframe>
<div class="card-body">
<a href="$pdf_url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<a href="$pdf_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/>
</a>
<a href="$pdf_url" target="preview" id="pdf-btn"><h4 class="card-title">PDF</h4></a>
<a href="$pdf_path" target="preview" id="pdf-btn"><h4 class="card-title">PDF</h4></a>
<p class="card-text">archive/output.pdf</p>
</div>
</div>
</div>
<div class="col-lg-2">
<div class="card">
<img class="card-img-top screenshot" src="$screenshot_url"></iframe>
<img class="card-img-top screenshot" src="$screenshot_path">
<div class="card-body">
<a href="$screenshot_url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<a href="$screenshot_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/>
</a>
<a href="$screenshot_url" target="preview"><h4 class="card-title">Screenshot</h4></a>
<a href="$screenshot_path" target="preview"><h4 class="card-title">Screenshot</h4></a>
<p class="card-text">archive/screenshot.png</p>
</div>
</div>
@ -373,12 +373,12 @@
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="$archive_org_url" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<iframe class="card-img-top" src="$archive_org_path" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body">
<a href="$archive_org_url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<a href="$archive_org_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/>
</a>
<a href="$archive_org_url" target="preview"><h4 class="card-title">Archive.Org</h4></a>
<a href="$archive_org_path" target="preview"><h4 class="card-title">Archive.Org</h4></a>
<p class="card-text">web.archive.org/web/...</p>
</div>
</div>

View file

@ -1,14 +1,14 @@
<tr>
<td title="$timestamp">$bookmarked_date</td>
<td class="title-col">
<a href="$archive_path/$index_url"><img src="$favicon_url" class="link-favicon" decoding="async"></a>
<a href="$archive_path/index.html"><img src="$favicon_url" class="link-favicon" decoding="async"></a>
<a href="$archive_path/$wget_url" title="$title">
<span data-title-for="$url" data-archived="$is_archived">$title</span>
<small style="float:right">$tags</small>
</a>
</td>
<td>
<a href="$archive_path/$index_url">📄
<a href="$archive_path/index.html">📄
<span data-number-for="$url" title="Fetching any missing files...">$num_outputs <img src="static/spinner.gif" class="files-spinner" decoding="async"/></span>
</a>
</td>

View file

@ -0,0 +1,2 @@
User-agent: *
Disallow: /

View file

@ -5,8 +5,9 @@ import json
import time
import shutil
from string import Template
from json import JSONEncoder
from typing import List, Optional, Any, Union, IO
from typing import List, Optional, Any, Union, IO, Mapping
from inspect import signature
from functools import wraps
from hashlib import sha256
@ -396,6 +397,7 @@ def parse_date(date: Any) -> Optional[datetime]:
try:
return datetime.fromisoformat(date)
except Exception:
pass
try:
return datetime.strptime(date, '%Y-%m-%d %H:%M')
except Exception:
@ -552,9 +554,12 @@ def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, tim
@enforce_types
def copy_and_overwrite(from_path: str, to_path: str):
if os.path.exists(to_path):
shutil.rmtree(to_path)
if os.path.isdir(from_path):
shutil.rmtree(to_path, ignore_errors=True)
shutil.copytree(from_path, to_path)
else:
with open(from_path, 'rb') as src:
atomic_write(src.read(), to_path)
@enforce_types
def chrome_args(**options) -> List[str]:
@ -642,11 +647,27 @@ def to_csv(links: List[Link], csv_cols: Optional[List[str]]=None,
return '\n'.join((header_str, *row_strs))
def atomic_write(contents: Union[dict, str], path: str) -> None:
@enforce_types
def render_template(template_path: str, context: Mapping[str, str]) -> str:
    """Read the template file at template_path and fill its $placeholders from context.

    Substitution is done with string.Template.substitute, so a placeholder
    that has no matching key in context raises KeyError.
    """
    # will be replaced by django templates in the future
    with open(template_path, 'r', encoding='utf-8') as template_file:
        html_template = Template(template_file.read())

    return html_template.substitute(**context)
def atomic_write(contents: Union[dict, str, bytes], path: str) -> None:
"""Safe atomic write to filesystem by writing to temp file + atomic rename"""
try:
tmp_file = '{}.tmp'.format(path)
with open(tmp_file, 'w+', encoding='utf-8') as f:
if isinstance(contents, bytes):
args = {'mode': 'wb+'}
else:
args = {'mode': 'w+', 'encoding': 'utf-8'}
with open(tmp_file, **args) as f:
if isinstance(contents, dict):
to_json(contents, file=f)
else:
@ -678,3 +699,5 @@ def reject_stdin(caller: str) -> None:
))
print()
raise SystemExit(1)