mirror of https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-23 12:43:10 +00:00
Merge pull request #555 from cdvv7788/cleanup
commit 04291c4d47
5 changed files with 42 additions and 65 deletions
@@ -383,7 +383,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type

 def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links without checking archive status or data directory validity"""
-    links = [snapshot.as_link() for snapshot in snapshots.iterator()]
+    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
     return {
         link.link_dir: link
         for link in links

@@ -391,7 +391,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option

 def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are archived with a valid data directory"""
-    links = [snapshot.as_link() for snapshot in snapshots.iterator()]
+    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
     return {
         link.link_dir: link
         for link in filter(is_archived, links)

@@ -399,7 +399,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio

 def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are unarchived with no data directory or an empty data directory"""
-    links = [snapshot.as_link() for snapshot in snapshots.iterator()]
+    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
     return {
         link.link_dir: link
         for link in filter(is_unarchived, links)

@@ -424,7 +424,7 @@ def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option

 def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs with a valid index matched to the main index and archived content"""
-    links = [snapshot.as_link() for snapshot in snapshots.iterator()]
+    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
     return {
         link.link_dir: link
         for link in filter(is_valid, links)

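The four hunks above all make the same swap: snapshot.as_link() becomes
snapshot.as_link_with_details(). A minimal sketch of the difference these
folder filters rely on, assuming the Snapshot model from ArchiveBox's core
Django app (illustrative only, not part of the diff):

    from core.models import Snapshot   # assumed module path

    snapshot = Snapshot.objects.first()

    # Link built from the snapshot's database fields only
    link = snapshot.as_link()

    # additionally loads details (e.g. archive history) parsed from the
    # snapshot's data directory on disk, which the is_archived/is_valid/
    # is_unarchived filters above need in order to report accurate status
    detailed = snapshot.as_link_with_details()
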
@@ -46,6 +46,14 @@ def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]:
                     yield line.split('"')[1]
     return ()

+@enforce_types
+def generate_index_from_links(links: List[Link], with_headers: bool):
+    if with_headers:
+        output = main_index_template(links)
+    else:
+        output = main_index_template(links, template=MINIMAL_INDEX_TEMPLATE)
+    return output
+
 @enforce_types
 def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> str:
     """render the template for the entire main index"""

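A rough usage sketch of the new HTML helper (hypothetical snippet, not part
of the diff): with_headers selects between the full main index template and
the minimal one.

    from archivebox.index.html import generate_index_from_links

    # full index page rendered with MAIN_INDEX_TEMPLATE
    page = generate_index_from_links(links, with_headers=True)

    # stripped-down markup rendered with MINIMAL_INDEX_TEMPLATE
    fragment = generate_index_from_links(links, with_headers=False)
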
@@ -8,7 +8,7 @@ from pathlib import Path
 from datetime import datetime
 from typing import List, Optional, Iterator, Any, Union

-from .schema import Link, ArchiveResult
+from .schema import Link
 from ..system import atomic_write
 from ..util import enforce_types
 from ..config import (

@@ -39,7 +39,20 @@ MAIN_INDEX_HEADER = {
     },
 }

-### Main Links Index
-
+@enforce_types
+def generate_json_index_from_links(links: List[Link], with_headers: bool):
+    if with_headers:
+        output = {
+            **MAIN_INDEX_HEADER,
+            'num_links': len(links),
+            'updated': datetime.now(),
+            'last_run_cmd': sys.argv,
+            'links': links,
+        }
+    else:
+        output = links
+    return to_json(output, indent=4, sort_keys=True)
+
+
 @enforce_types
 def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:

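Usage sketch for the JSON counterpart (hypothetical snippet, not part of the
diff): with_headers=True wraps the links in the MAIN_INDEX_HEADER metadata
plus num_links/updated/last_run_cmd fields, while with_headers=False
serializes the bare list of links.

    from archivebox.index.json import generate_json_index_from_links

    full_index = generate_json_index_from_links(links, with_headers=True)
    bare_list = generate_json_index_from_links(links, with_headers=False)
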
@@ -65,30 +78,6 @@ def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
                 continue
     return ()

-@enforce_types
-def write_json_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
-    """write the json link index to a given path"""
-
-    assert isinstance(links, List), 'Links must be a list, not a generator.'
-    assert not links or isinstance(links[0].history, dict)
-    assert not links or isinstance(links[0].sources, list)
-
-    if links and links[0].history.get('title'):
-        assert isinstance(links[0].history['title'][0], ArchiveResult)
-
-    if links and links[0].sources:
-        assert isinstance(links[0].sources[0], str)
-
-    main_index_json = {
-        **MAIN_INDEX_HEADER,
-        'num_links': len(links),
-        'updated': datetime.now(),
-        'last_run_cmd': sys.argv,
-        'links': links,
-    }
-    atomic_write(str(Path(out_dir) / JSON_INDEX_FILENAME), main_index_json)
-
-
 ### Link Details Index

 @enforce_types

@@ -477,39 +477,7 @@ def printable_filesize(num_bytes: Union[int, float]) -> str:

 @enforce_types
 def printable_folders(folders: Dict[str, Optional["Link"]],
-                      json: bool=False,
-                      html: bool=False,
-                      csv: Optional[str]=None,
                       with_headers: bool=False) -> str:
-
-    from .index.json import MAIN_INDEX_HEADER
-
-    links = folders.values()
-    if json:
-        from .index.json import to_json
-        if with_headers:
-            output = {
-                **MAIN_INDEX_HEADER,
-                'num_links': len(links),
-                'updated': datetime.now(),
-                'last_run_cmd': sys.argv,
-                'links': links,
-            }
-        else:
-            output = links
-        return to_json(output, indent=4, sort_keys=True)
-    elif html:
-        from .index.html import main_index_template
-        if with_headers:
-            output = main_index_template(links)
-        else:
-            from .index.html import MINIMAL_INDEX_TEMPLATE
-            output = main_index_template(links, template=MINIMAL_INDEX_TEMPLATE)
-        return output
-    elif csv:
-        from .index.csv import links_to_csv
-        return links_to_csv(folders.values(), cols=csv.split(','), header=with_headers)
-
     return '\n'.join(
         f'{folder} {link and link.url} "{link and link.title}"'
         for folder, link in folders.items()

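After this hunk, printable_folders only produces the plain-text listing; the
json/html/csv branches move out to the dedicated index helpers. A sketch of
the remaining behavior (paths, link object, and titles are made up):

    folders = {
        '/data/archive/1611234567': example_link,   # a hypothetical archived Link
        '/data/archive/1611234999': None,           # an orphaned folder
    }
    print(printable_folders(folders))
    # /data/archive/1611234567 https://example.com "Example Domain"
    # /data/archive/1611234999 None "None"
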
@@ -49,12 +49,17 @@ from .index import (
 from .index.json import (
     parse_json_main_index,
     parse_json_links_details,
+    generate_json_index_from_links,
 )
 from .index.sql import (
     get_admins,
     apply_migrations,
     remove_from_sql_main_index,
 )
+from .index.html import (
+    generate_index_from_links,
+)
+from .index.csv import links_to_csv
 from .extractors import archive_links, archive_link, ignore_methods
 from .config import (
     stderr,

@@ -745,7 +750,6 @@ def list_all(filter_patterns_str: Optional[str]=None,
     elif filter_patterns_str:
         filter_patterns = filter_patterns_str.split('\n')

-
     snapshots = list_links(
         filter_patterns=filter_patterns,
         filter_type=filter_type,

@@ -762,7 +766,15 @@ def list_all(filter_patterns_str: Optional[str]=None,
         out_dir=out_dir,
     )

-    print(printable_folders(folders, json=json, csv=csv, html=html, with_headers=with_headers))
+    if json:
+        output = generate_json_index_from_links(folders.values(), with_headers)
+    elif html:
+        output = generate_index_from_links(folders.values(), with_headers)
+    elif csv:
+        output = links_to_csv(folders.values(), cols=csv.split(','), header=with_headers)
+    else:
+        output = printable_folders(folders, with_headers=with_headers)
+    print(output)
     return folders

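Net effect: list_all now delegates formatting to the index helpers instead of
printable_folders. A hedged usage sketch (parameter names taken from the diff;
the exact signature may differ):

    from archivebox.main import list_all

    list_all()                                  # plain text via printable_folders
    list_all(json=True, with_headers=True)      # full JSON index document
    list_all(html=True, with_headers=False)     # minimal HTML index markup
    list_all(csv='timestamp,url')               # CSV with the given columns
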