diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 9e460dc7..97eeb6a2 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -383,7 +383,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
 
 def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links without checking archive status or data directory validity"""
-    links = [snapshot.as_link() for snapshot in snapshots.iterator()]
+    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
     return {
         link.link_dir: link
         for link in links
@@ -391,7 +391,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
 
 def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are archived with a valid data directory"""
-    links = [snapshot.as_link() for snapshot in snapshots.iterator()]
+    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
     return {
         link.link_dir: link
         for link in filter(is_archived, links)
@@ -399,7 +399,7 @@ def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
 
 def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are unarchived with no data directory or an empty data directory"""
-    links = [snapshot.as_link() for snapshot in snapshots.iterator()]
+    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
     return {
         link.link_dir: link
         for link in filter(is_unarchived, links)
@@ -424,7 +424,7 @@ def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
 
 def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs with a valid index matched to the main index and archived content"""
-    links = [snapshot.as_link() for snapshot in snapshots.iterator()]
+    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
     return {
         link.link_dir: link
         for link in filter(is_valid, links)
diff --git a/archivebox/index/html.py b/archivebox/index/html.py
index 04340957..4ead04ce 100644
--- a/archivebox/index/html.py
+++ b/archivebox/index/html.py
@@ -46,6 +46,14 @@ def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]:
                     yield line.split('"')[1]
     return ()
 
+@enforce_types
+def generate_index_from_links(links: List[Link], with_headers: bool):
+    if with_headers:
+        output = main_index_template(links)
+    else:
+        output = main_index_template(links, template=MINIMAL_INDEX_TEMPLATE)
+    return output
+
 @enforce_types
 def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> str:
     """render the template for the entire main index"""
diff --git a/archivebox/index/json.py b/archivebox/index/json.py
index 1c3ce6e8..f24b969f 100644
--- a/archivebox/index/json.py
+++ b/archivebox/index/json.py
@@ -8,7 +8,7 @@ from pathlib import Path
 from datetime import datetime
 from typing import List, Optional, Iterator, Any, Union
 
-from .schema import Link, ArchiveResult
+from .schema import Link
 from ..system import atomic_write
 from ..util import enforce_types
 from ..config import (
@@ -39,7 +39,20 @@ MAIN_INDEX_HEADER = {
     },
 }
 
-### Main Links Index
+@enforce_types
+def generate_json_index_from_links(links: List[Link], with_headers: bool):
+    if with_headers:
+        output = {
+            **MAIN_INDEX_HEADER,
+            'num_links': len(links),
+            'updated': datetime.now(),
+            'last_run_cmd': sys.argv,
+            'links': links,
+        }
+    else:
+        output = links
+    return to_json(output, indent=4, sort_keys=True)
+
 @enforce_types
 def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
@@ -65,30 +78,6 @@ def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
                             continue
     return ()
 
-@enforce_types
-def write_json_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
-    """write the json link index to a given path"""
-
-    assert isinstance(links, List), 'Links must be a list, not a generator.'
-    assert not links or isinstance(links[0].history, dict)
-    assert not links or isinstance(links[0].sources, list)
-
-    if links and links[0].history.get('title'):
-        assert isinstance(links[0].history['title'][0], ArchiveResult)
-
-    if links and links[0].sources:
-        assert isinstance(links[0].sources[0], str)
-
-    main_index_json = {
-        **MAIN_INDEX_HEADER,
-        'num_links': len(links),
-        'updated': datetime.now(),
-        'last_run_cmd': sys.argv,
-        'links': links,
-    }
-    atomic_write(str(Path(out_dir) / JSON_INDEX_FILENAME), main_index_json)
-
-
 ### Link Details Index
 
 @enforce_types
diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py
index 8648e0a4..bc65e276 100644
--- a/archivebox/logging_util.py
+++ b/archivebox/logging_util.py
@@ -477,39 +477,7 @@ def printable_filesize(num_bytes: Union[int, float]) -> str:
 
 @enforce_types
 def printable_folders(folders: Dict[str, Optional["Link"]],
-                      json: bool=False,
-                      html: bool=False,
-                      csv: Optional[str]=None,
                       with_headers: bool=False) -> str:
-
-    from .index.json import MAIN_INDEX_HEADER
-
-    links = folders.values()
-    if json:
-        from .index.json import to_json
-        if with_headers:
-            output = {
-                **MAIN_INDEX_HEADER,
-                'num_links': len(links),
-                'updated': datetime.now(),
-                'last_run_cmd': sys.argv,
-                'links': links,
-            }
-        else:
-            output = links
-        return to_json(output, indent=4, sort_keys=True)
-    elif html:
-        from .index.html import main_index_template
-        if with_headers:
-            output = main_index_template(links)
-        else:
-            from .index.html import MINIMAL_INDEX_TEMPLATE
-            output = main_index_template(links, template=MINIMAL_INDEX_TEMPLATE)
-        return output
-    elif csv:
-        from .index.csv import links_to_csv
-        return links_to_csv(folders.values(), cols=csv.split(','), header=with_headers)
-
     return '\n'.join(
         f'{folder} {link and link.url} "{link and link.title}"'
         for folder, link in folders.items()
diff --git a/archivebox/main.py b/archivebox/main.py
index 94658a8f..2d36e1f2 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -49,12 +49,17 @@ from .index import (
 from .index.json import (
     parse_json_main_index,
     parse_json_links_details,
+    generate_json_index_from_links,
 )
 from .index.sql import (
     get_admins,
     apply_migrations,
     remove_from_sql_main_index,
 )
+from .index.html import (
+    generate_index_from_links,
+)
+from .index.csv import links_to_csv
 from .extractors import archive_links, archive_link, ignore_methods
 from .config import (
     stderr,
@@ -745,7 +750,6 @@ def list_all(filter_patterns_str: Optional[str]=None,
     elif filter_patterns_str:
         filter_patterns = filter_patterns_str.split('\n')
 
-
     snapshots = list_links(
         filter_patterns=filter_patterns,
         filter_type=filter_type,
@@ -761,8 +765,16 @@ def list_all(filter_patterns_str: Optional[str]=None,
         status=status,
         out_dir=out_dir,
     )
-
-    print(printable_folders(folders, json=json, csv=csv, html=html, with_headers=with_headers))
+
+    if json:
+        output = generate_json_index_from_links(folders.values(), with_headers)
+    elif html:
+        output = generate_index_from_links(folders.values(), with_headers)
+    elif csv:
+        output = links_to_csv(folders.values(), cols=csv.split(','), header=with_headers)
+    else:
+        output = printable_folders(folders, with_headers=with_headers)
+    print(output)
     return folders
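Usage sketch (illustrative only, not part of the patch above): after this change each output format has a standalone generator that takes a list of Link objects plus a `with_headers` flag, instead of everything being routed through `printable_folders`. The snippet below shows how the relocated helpers compose; `setup_django`, `OUTPUT_DIR`, and the `Snapshot` model come from the surrounding ArchiveBox codebase and are assumed here, not introduced by this diff.

```python
# Minimal sketch: exercise the generators added/relocated by this patch.
# Assumes an already-initialized ArchiveBox data directory.
from archivebox.config import OUTPUT_DIR, setup_django

setup_django(OUTPUT_DIR)  # boot the Django ORM so Snapshot queries work

from core.models import Snapshot
from archivebox.index.csv import links_to_csv
from archivebox.index.html import generate_index_from_links
from archivebox.index.json import generate_json_index_from_links

# Same Snapshot -> Link conversion the folder helpers above now use:
links = [s.as_link_with_details() for s in Snapshot.objects.all().iterator()]

# JSON: with_headers=True wraps the links in the MAIN_INDEX_HEADER metadata
# dict; with_headers=False serializes a bare list of links.
print(generate_json_index_from_links(links, with_headers=True))

# HTML: the full index page, or the MINIMAL_INDEX_TEMPLATE table when False.
print(generate_index_from_links(links, with_headers=False))

# CSV: column selection mirrors `archivebox list --csv=timestamp,url`.
print(links_to_csv(links, cols=['timestamp', 'url'], header=True))
```

Note that the new dispatch in `list_all()` passes `folders.values()` (a `dict_values` view) where the generators annotate `List[Link]`; wrapping the call in `list(...)` as above is the safer calling convention if the annotations are ever enforced strictly. Moving the branches out of `logging_util` also replaces the old function-local imports (`from .index.json import to_json`, etc., presumably there to dodge circular imports) with ordinary top-level imports in `main.py`.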