diff --git a/archivebox/__init__.py b/archivebox/__init__.py index 4cd3afd5..485a340d 100644 --- a/archivebox/__init__.py +++ b/archivebox/__init__.py @@ -1,3 +1,6 @@ __package__ = 'archivebox' from . import core +from . import cli + +from .main import * diff --git a/archivebox/__main__.py b/archivebox/__main__.py index 570a8c21..3386d46d 100755 --- a/archivebox/__main__.py +++ b/archivebox/__main__.py @@ -2,9 +2,14 @@ __package__ = 'archivebox' -from .cli.archivebox import main +import sys +from .cli import archivebox + + +def main(): + archivebox.main(args=sys.argv[1:], stdin=sys.stdin) if __name__ == '__main__': - main() + archivebox.main(args=sys.argv[1:], stdin=sys.stdin) diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index 38c577c7..f4cd99b9 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -2,13 +2,17 @@ __package__ = 'archivebox.cli' import os -from typing import Dict +from typing import Dict, List, Optional, IO from importlib import import_module CLI_DIR = os.path.dirname(os.path.abspath(__file__)) # these common commands will appear sorted before any others for ease-of-use -display_first = ('help', 'version', 'init', 'info', 'config', 'list', 'update', 'add', 'remove') +meta_cmds = ('help', 'version') +main_cmds = ('init', 'info', 'config') +archive_cmds = ('add', 'remove', 'update', 'list') + +display_first = (*meta_cmds, *main_cmds, *archive_cmds) # every imported command module must have these properties in order to be valid required_attrs = ('__package__', '__command__', 'main') @@ -42,11 +46,14 @@ def list_subcommands() -> Dict[str, str]: return dict(sorted(COMMANDS, key=display_order)) -def run_subcommand(subcommand: str, args=None) -> None: +def run_subcommand(subcommand: str, + subcommand_args: List[str]=None, + stdin: Optional[IO]=None, + pwd: Optional[str]=None) -> None: """run a given ArchiveBox subcommand with the given list of args""" module = import_module('.archivebox_{}'.format(subcommand), __package__) - module.main(args) # type: ignore + module.main(args=subcommand_args, stdin=stdin, pwd=pwd) # type: ignore SUBCOMMANDS = list_subcommands() diff --git a/archivebox/cli/archivebox.py b/archivebox/cli/archivebox.py index d1326721..d6fe207c 100755 --- a/archivebox/cli/archivebox.py +++ b/archivebox/cli/archivebox.py @@ -5,19 +5,17 @@ __package__ = 'archivebox.cli' __command__ = 'archivebox' __description__ = 'ArchiveBox: The self-hosted internet archive.' -import os import sys import argparse +from typing import Optional, List, IO + from . 
import list_subcommands, run_subcommand -from ..legacy.config import OUTPUT_DIR +from ..config import OUTPUT_DIR -def parse_args(args=None): - args = sys.argv[1:] if args is None else args - +def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: subcommands = list_subcommands() - parser = argparse.ArgumentParser( prog=__command__, description=__description__, @@ -43,54 +41,24 @@ def parse_args(args=None): default=None, ) parser.add_argument( - "args", + "subcommand_args", help="Arguments for the subcommand", nargs=argparse.REMAINDER, ) - - command = parser.parse_args(args) + command = parser.parse_args(args or ()) - if command.help: + if command.help or command.subcommand is None: command.subcommand = 'help' if command.version: command.subcommand = 'version' - # print('--------------------------------------------') - # print('Command: ', sys.argv[0]) - # print('Subcommand: ', command.subcommand) - # print('Args to pass:', args[1:]) - # print('--------------------------------------------') + run_subcommand( + subcommand=command.subcommand, + subcommand_args=command.subcommand_args, + stdin=stdin, + pwd=pwd or OUTPUT_DIR, + ) - return command.subcommand, command.args - - -def print_import_tutorial(): - print('Welcome to ArchiveBox!') - print() - print('To import an existing archive (from a previous version of ArchiveBox):') - print(' 1. cd into your data dir OUTPUT_DIR (usually ArchiveBox/output) and run:') - print(' 2. archivebox init') - print() - print('To start a new archive:') - print(' 1. Create an emptry directory, then cd into it and run:') - print(' 2. archivebox init') - print() - print('For more information, see the migration docs here:') - print(' https://github.com/pirate/ArchiveBox/wiki/Migration') - -def main(args=None): - subcommand, subcommand_args = parse_args(args) - existing_index = os.path.exists(os.path.join(OUTPUT_DIR, 'index.json')) - - if subcommand is None: - if existing_index: - run_subcommand('help', subcommand_args) - else: - print_import_tutorial() - raise SystemExit(0) - - run_subcommand(subcommand, subcommand_args) - if __name__ == '__main__': - main() + main(args=sys.argv[1:], stdin=sys.stdin) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 714e916c..d0209916 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -7,90 +7,75 @@ __description__ = 'Add a new URL or list of URLs to your archive' import sys import argparse -from typing import List, Optional +from typing import List, Optional, IO -from ..legacy.config import stderr, check_dependencies, check_data_folder -from ..legacy.util import ( - handle_stdin_import, - handle_file_import, -) -from ..legacy.main import update_archive_data +from ..main import add +from ..util import SmartFormatter, accept_stdin +from ..config import OUTPUT_DIR, ONLY_NEW -def main(args: List[str]=None, stdin: Optional[str]=None) -> None: - check_data_folder() - - args = sys.argv[1:] if args is None else args - +def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: parser = argparse.ArgumentParser( prog=__command__, description=__description__, add_help=True, + formatter_class=SmartFormatter, ) - # parser.add_argument( - # '--depth', #'-d', - # type=int, - # help='Recursively archive all linked pages up to this many hops away', - # default=0, - # ) parser.add_argument( - '--only-new', #'-n', + '--update-all', #'-n', action='store_true', - help="Don't attempt to retry 
previously skipped/failed links when updating", + default=not ONLY_NEW, + help="Also retry previously skipped/failed links when adding new links", ) parser.add_argument( '--index-only', #'-o', action='store_true', help="Add the links to the main index without archiving them", ) - # parser.add_argument( - # '--mirror', #'-m', - # action='store_true', - # help='Archive an entire site (finding all linked pages below it on the same domain)', - # ) - # parser.add_argument( - # '--crawler', #'-r', - # choices=('depth_first', 'breadth_first'), - # help='Controls which crawler to use in order to find outlinks in a given page', - # default=None, - # ) parser.add_argument( - 'url', + 'import_path', nargs='?', type=str, default=None, - help='URL of page to archive (or path to local file)' + help=( + 'URL or path to local file containing a list of links to import. e.g.:\n' + ' https://getpocket.com/users/USERNAME/feed/all\n' + ' https://example.com/some/rss/feed.xml\n' + ' ~/Downloads/firefox_bookmarks_export.html\n' + ' ~/Desktop/sites_list.csv\n' + ) ) - command = parser.parse_args(args) - - check_dependencies() - - ### Handle ingesting urls piped in through stdin - # (.e.g if user does cat example_urls.txt | archivebox add) - import_path = None - if stdin or not sys.stdin.isatty(): - stdin_raw_text = stdin or sys.stdin.read() - if stdin_raw_text and command.url: - stderr( - '[X] You should pass either a path as an argument, ' - 'or pass a list of links via stdin, but not both.\n' - ) - raise SystemExit(1) - - import_path = handle_stdin_import(stdin_raw_text) - - ### Handle ingesting url from a remote file/feed - # (e.g. if an RSS feed URL is used as the import path) - elif command.url: - import_path = handle_file_import(command.url) - - update_archive_data( - import_path=import_path, - resume=None, - only_new=command.only_new, + command = parser.parse_args(args or ()) + import_str = accept_stdin(stdin) + add( + import_str=import_str, + import_path=command.import_path, + update_all=command.update_all, index_only=command.index_only, + out_dir=pwd or OUTPUT_DIR, ) if __name__ == '__main__': - main() + main(args=sys.argv[1:], stdin=sys.stdin) + + +# TODO: Implement these +# +# parser.add_argument( +# '--depth', #'-d', +# type=int, +# help='Recursively archive all linked pages up to this many hops away', +# default=0, +# ) +# parser.add_argument( +# '--mirror', #'-m', +# action='store_true', +# help='Archive an entire site (finding all linked pages below it on the same domain)', +# ) +# parser.add_argument( +# '--crawler', #'-r', +# choices=('depth_first', 'breadth_first'), +# help='Controls which crawler to use in order to find outlinks in a given page', +# default=None, +# ) diff --git a/archivebox/cli/archivebox_config.py b/archivebox/cli/archivebox_config.py index 97a8447d..2d373535 100644 --- a/archivebox/cli/archivebox_config.py +++ b/archivebox/cli/archivebox_config.py @@ -7,28 +7,14 @@ __description__ = 'Get and set your ArchiveBox project configuration values' import sys import argparse -from typing import Optional, List +from typing import Optional, List, IO -from ..legacy.util import SmartFormatter -from ..legacy.config import ( - check_data_folder, - OUTPUT_DIR, - load_all_config, - write_config_file, - CONFIG, - CONFIG_FILE, - USER_CONFIG, - ConfigDict, - stderr, - get_real_name, -) +from ..main import config +from ..util import SmartFormatter, accept_stdin +from ..config import OUTPUT_DIR -def main(args: List[str]=None, stdin: Optional[str]=None) -> None: - check_data_folder() - - args = 
sys.argv[1:] if args is None else args - +def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: parser = argparse.ArgumentParser( prog=__command__, description=__description__, @@ -57,102 +43,18 @@ def main(args: List[str]=None, stdin: Optional[str]=None) -> None: type=str, help='KEY or KEY=VALUE formatted config values to get or set', ) - command = parser.parse_args(args) + command = parser.parse_args(args or ()) + config_options_str = accept_stdin(stdin) - if stdin or not sys.stdin.isatty(): - stdin_raw_text = stdin or sys.stdin.read() - if stdin_raw_text and command.config_options: - stderr( - '[X] You should either pass config values as an arguments ' - 'or via stdin, but not both.\n', - color='red', - ) - raise SystemExit(1) - - config_options = stdin_raw_text.split('\n') - else: - config_options = command.config_options - - no_args = not (command.get or command.set or command.reset or command.config_options) - - matching_config: ConfigDict = {} - if command.get or no_args: - if config_options: - config_options = [get_real_name(key) for key in config_options] - matching_config = {key: CONFIG[key] for key in config_options if key in CONFIG} - failed_config = [key for key in config_options if key not in CONFIG] - if failed_config: - stderr() - stderr('[X] These options failed to get', color='red') - stderr(' {}'.format('\n '.join(config_options))) - raise SystemExit(1) - else: - matching_config = CONFIG - - print(printable_config(matching_config)) - raise SystemExit(not matching_config) - elif command.set: - new_config = {} - failed_options = [] - for line in config_options: - if line.startswith('#') or not line.strip(): - continue - if '=' not in line: - stderr('[X] Config KEY=VALUE must have an = sign in it', color='red') - stderr(f' {line}') - raise SystemExit(2) - - raw_key, val = line.split('=') - raw_key = raw_key.upper().strip() - key = get_real_name(raw_key) - if key != raw_key: - stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow') - - if key in CONFIG: - new_config[key] = val.strip() - else: - failed_options.append(line) - - if new_config: - before = CONFIG - matching_config = write_config_file(new_config, out_dir=OUTPUT_DIR) - after = load_all_config() - print(printable_config(matching_config)) - - side_effect_changes: ConfigDict = {} - for key, val in after.items(): - if key in USER_CONFIG and (before[key] != after[key]) and (key not in matching_config): - side_effect_changes[key] = after[key] - - if side_effect_changes: - stderr() - stderr('[i] Note: This change also affected these other options that depended on it:', color='lightyellow') - print(' {}'.format(printable_config(side_effect_changes, prefix=' '))) - if failed_options: - stderr() - stderr('[X] These options failed to set:', color='red') - stderr(' {}'.format('\n '.join(failed_options))) - raise SystemExit(bool(failed_options)) - elif command.reset: - stderr('[X] This command is not implemented yet.', color='red') - stderr(' Please manually remove the relevant lines from your config file:') - stderr(f' {CONFIG_FILE}') - raise SystemExit(2) - - else: - stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red') - stderr(' archivebox config') - stderr(' archivebox config --get SOME_KEY') - stderr(' archivebox config --set SOME_KEY=SOME_VALUE') - raise SystemExit(2) - - -def printable_config(config: ConfigDict, prefix: str='') -> str: - return 
f'\n{prefix}'.join( - f'{key}={val}' - for key, val in config.items() - if not (isinstance(val, dict) or callable(val)) + config( + config_options_str=config_options_str, + config_options=command.config_options, + get=command.get, + set=command.set, + reset=command.reset, + out_dir=pwd or OUTPUT_DIR, ) + if __name__ == '__main__': - main() + main(args=sys.argv[1:], stdin=sys.stdin) diff --git a/archivebox/cli/archivebox_help.py b/archivebox/cli/archivebox_help.py index b049ef70..b1cf1c5b 100755 --- a/archivebox/cli/archivebox_help.py +++ b/archivebox/cli/archivebox_help.py @@ -7,52 +7,24 @@ __description__ = 'Print the ArchiveBox help message and usage' import sys import argparse -from ..legacy.util import reject_stdin -from ..legacy.config import ANSI -from . import list_subcommands +from typing import Optional, List, IO + +from ..main import help +from ..util import reject_stdin +from ..config import OUTPUT_DIR -def main(args=None): - args = sys.argv[1:] if args is None else args - +def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: parser = argparse.ArgumentParser( prog=__command__, description=__description__, add_help=True, ) - parser.parse_args(args) - reject_stdin(__command__) + parser.parse_args(args or ()) + reject_stdin(__command__, stdin) - - COMMANDS_HELP_TEXT = '\n '.join( - f'{cmd.ljust(20)} {summary}' - for cmd, summary in list_subcommands().items() - ) - - print('''{green}ArchiveBox: The self-hosted internet archive.{reset} - -{lightblue}Usage:{reset} - archivebox [command] [--help] [--version] [...args] - -{lightblue}Comamnds:{reset} - {} - -{lightblue}Example Use:{reset} - mkdir my-archive; cd my-archive/ - archivebox init - archivebox info - - archivebox add https://example.com/some/page - archivebox add --depth=1 ~/Downloads/bookmarks_export.html - - archivebox list --sort=timestamp --csv=timestamp,url,is_archived - archivebox schedule --every=week https://example.com/some/feed.rss - archivebox update --resume=15109948213.123 - -{lightblue}Documentation:{reset} - https://github.com/pirate/ArchiveBox/wiki -'''.format(COMMANDS_HELP_TEXT, **ANSI)) + help(out_dir=pwd or OUTPUT_DIR) if __name__ == '__main__': - main() + main(args=sys.argv[1:], stdin=sys.stdin) diff --git a/archivebox/cli/archivebox_info.py b/archivebox/cli/archivebox_info.py index bf04d89e..d3cc99aa 100644 --- a/archivebox/cli/archivebox_info.py +++ b/archivebox/cli/archivebox_info.py @@ -7,25 +7,24 @@ __description__ = 'Print out some info and statistics about the archive collecti import sys import argparse -from ..legacy.config import check_data_folder -from ..legacy.util import reject_stdin -from ..legacy.main import info +from typing import Optional, List, IO + +from ..main import info +from ..config import OUTPUT_DIR +from ..util import reject_stdin -def main(args=None): - check_data_folder() - - args = sys.argv[1:] if args is None else args - +def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: parser = argparse.ArgumentParser( prog=__command__, description=__description__, add_help=True, ) - parser.parse_args(args) - reject_stdin(__command__) + parser.parse_args(args or ()) + reject_stdin(__command__, stdin) + + info(out_dir=pwd or OUTPUT_DIR) - info() if __name__ == '__main__': - main() + main(args=sys.argv[1:], stdin=sys.stdin) diff --git a/archivebox/cli/archivebox_init.py b/archivebox/cli/archivebox_init.py index 632b9a1e..a66f011c 100755 --- a/archivebox/cli/archivebox_init.py +++ 
b/archivebox/cli/archivebox_init.py @@ -7,23 +7,24 @@ __description__ = 'Initialize a new ArchiveBox collection in the current directo import sys import argparse -from ..legacy.util import reject_stdin -from ..legacy.main import init +from typing import Optional, List, IO + +from ..main import init +from ..util import reject_stdin +from ..config import OUTPUT_DIR -def main(args=None): - args = sys.argv[1:] if args is None else args - +def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: parser = argparse.ArgumentParser( prog=__command__, description=__description__, add_help=True, ) - parser.parse_args(args) - reject_stdin(__command__) + parser.parse_args(args or ()) + reject_stdin(__command__, stdin) - init() + init(out_dir=pwd or OUTPUT_DIR) if __name__ == '__main__': - main() + main(args=sys.argv[1:], stdin=sys.stdin) diff --git a/archivebox/cli/archivebox_list.py b/archivebox/cli/archivebox_list.py index 416fa3d7..126ad144 100644 --- a/archivebox/cli/archivebox_list.py +++ b/archivebox/cli/archivebox_list.py @@ -2,15 +2,17 @@ __package__ = 'archivebox.cli' __command__ = 'archivebox list' -__description__ = 'List all the URLs currently in the archive.' +__description__ = 'List, filter, and export information about archive entries' import sys import argparse -from ..legacy.util import SmartFormatter, reject_stdin, to_json, to_csv -from ..legacy.config import check_data_folder, OUTPUT_DIR -from ..legacy.main import ( - list_archive_data, +from typing import Optional, List, IO + +from ..main import list_all +from ..util import SmartFormatter, accept_stdin +from ..config import OUTPUT_DIR +from ..index import ( get_indexed_folders, get_archived_folders, get_unarchived_folders, @@ -23,11 +25,7 @@ from ..legacy.main import ( get_unrecognized_folders, ) -def main(args=None): - check_data_folder() - - args = sys.argv[1:] if args is None else args - +def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: parser = argparse.ArgumentParser( prog=__command__, description=__description__, @@ -93,57 +91,27 @@ def main(args=None): help='Type of pattern matching to use when filtering URLs', ) parser.add_argument( - 'patterns', + 'filter_patterns', nargs='*', type=str, default=None, help='List only URLs matching these filter patterns.' 
) - command = parser.parse_args(args) - reject_stdin(__command__) + command = parser.parse_args(args or ()) + filter_patterns_str = accept_stdin(stdin) - links = list_archive_data( - filter_patterns=command.patterns, + list_all( + filter_patterns_str=filter_patterns_str, + filter_patterns=command.filter_patterns, filter_type=command.filter_type, - before=command.before, + status=command.status, after=command.after, + before=command.before, + sort=command.sort, + csv=command.csv, + json=command.json, + out_dir=pwd or OUTPUT_DIR, ) - if command.sort: - links = sorted(links, key=lambda link: getattr(link, command.sort)) - - links = list(links) - - if command.status == 'indexed': - folders = get_indexed_folders(links, out_dir=OUTPUT_DIR) - elif command.status == 'archived': - folders = get_archived_folders(links, out_dir=OUTPUT_DIR) - elif command.status == 'unarchived': - folders = get_unarchived_folders(links, out_dir=OUTPUT_DIR) - - elif command.status == 'present': - folders = get_present_folders(links, out_dir=OUTPUT_DIR) - elif command.status == 'valid': - folders = get_valid_folders(links, out_dir=OUTPUT_DIR) - elif command.status == 'invalid': - folders = get_invalid_folders(links, out_dir=OUTPUT_DIR) - - elif command.status == 'duplicate': - folders = get_duplicate_folders(links, out_dir=OUTPUT_DIR) - elif command.status == 'orphaned': - folders = get_orphaned_folders(links, out_dir=OUTPUT_DIR) - elif command.status == 'corrupted': - folders = get_corrupted_folders(links, out_dir=OUTPUT_DIR) - elif command.status == 'unrecognized': - folders = get_unrecognized_folders(links, out_dir=OUTPUT_DIR) - - if command.csv: - print(to_csv(folders.values(), csv_cols=command.csv.split(','), header=True)) - elif command.json: - print(to_json(folders.values(), indent=4, sort_keys=True)) - else: - print('\n'.join(f'{folder} {link}' for folder, link in folders.items())) - raise SystemExit(not folders) - if __name__ == '__main__': - main() + main(args=sys.argv[1:], stdin=sys.stdin) diff --git a/archivebox/cli/archivebox_manage.py b/archivebox/cli/archivebox_manage.py index 9d1c8eb3..f2b91cc2 100644 --- a/archivebox/cli/archivebox_manage.py +++ b/archivebox/cli/archivebox_manage.py @@ -6,24 +6,18 @@ __description__ = 'Run an ArchiveBox Django management command' import sys -from ..legacy.config import OUTPUT_DIR, setup_django, check_data_folder +from typing import Optional, List, IO + +from ..main import manage +from ..config import OUTPUT_DIR -def main(args=None): - check_data_folder() - - setup_django(OUTPUT_DIR) - from django.core.management import execute_from_command_line - - args = sys.argv if args is None else ['archivebox', *args] - - args[0] = f'{sys.argv[0]} manage' - - if args[1:] == []: - args.append('help') - - execute_from_command_line(args) +def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: + manage( + args=args, + out_dir=pwd or OUTPUT_DIR, + ) if __name__ == '__main__': - main() + main(args=sys.argv[1:], stdin=sys.stdin) diff --git a/archivebox/cli/archivebox_remove.py b/archivebox/cli/archivebox_remove.py index 4ddba354..c5f5ff53 100644 --- a/archivebox/cli/archivebox_remove.py +++ b/archivebox/cli/archivebox_remove.py @@ -7,17 +7,14 @@ __description__ = 'Remove the specified URLs from the archive.' 
import sys import argparse +from typing import Optional, List, IO -from ..legacy.config import check_data_folder -from ..legacy.util import reject_stdin -from ..legacy.main import remove_archive_links +from ..main import remove +from ..util import accept_stdin +from ..config import OUTPUT_DIR -def main(args=None): - check_data_folder() - - args = sys.argv[1:] if args is None else args - +def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: parser = argparse.ArgumentParser( prog=__command__, description=__description__, @@ -56,33 +53,25 @@ def main(args=None): help='Type of pattern matching to use when filtering URLs', ) parser.add_argument( - 'pattern', + 'filter_patterns', nargs='*', type=str, - default=None, help='URLs matching this filter pattern will be removed from the index.' ) - command = parser.parse_args(args) + command = parser.parse_args(args or ()) + filter_str = accept_stdin(stdin) - if not sys.stdin.isatty(): - stdin_raw_text = sys.stdin.read() - if stdin_raw_text and command.url: - print( - '[X] You should pass either a pattern as an argument, ' - 'or pass a list of patterns via stdin, but not both.\n' - ) - raise SystemExit(1) - - patterns = [pattern.strip() for pattern in stdin_raw_text.split('\n')] - else: - patterns = command.pattern - - remove_archive_links( - filter_patterns=patterns, filter_type=command.filter_type, - before=command.before, after=command.after, - yes=command.yes, delete=command.delete, + remove( + filter_str=filter_str, + filter_patterns=command.filter_patterns, + filter_type=command.filter_type, + before=command.before, + after=command.after, + yes=command.yes, + delete=command.delete, + out_dir=pwd or OUTPUT_DIR, ) if __name__ == '__main__': - main() + main(args=sys.argv[1:], stdin=sys.stdin) diff --git a/archivebox/cli/archivebox_schedule.py b/archivebox/cli/archivebox_schedule.py index f6e685f8..b6a15e13 100644 --- a/archivebox/cli/archivebox_schedule.py +++ b/archivebox/cli/archivebox_schedule.py @@ -4,34 +4,17 @@ __package__ = 'archivebox.cli' __command__ = 'archivebox schedule' __description__ = 'Set ArchiveBox to regularly import URLs at specific times using cron' -import os import sys import argparse -from datetime import datetime -from crontab import CronTab, CronSlices +from typing import Optional, List, IO + +from ..main import schedule +from ..util import reject_stdin +from ..config import OUTPUT_DIR -from ..legacy.util import reject_stdin -from ..legacy.config import ( - OUTPUT_DIR, - LOGS_DIR, - ARCHIVEBOX_BINARY, - USER, - ANSI, - stderr, - check_data_folder, -) - - -CRON_COMMENT = 'archivebox_schedule' - - -def main(args=None): - check_data_folder() - - args = sys.argv[1:] if args is None else args - +def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: parser = argparse.ArgumentParser( prog=__command__, description=__description__, @@ -57,7 +40,7 @@ def main(args=None): group.add_argument( '--clear', # '-c' action='store_true', - help=("Stop all ArchiveBox scheduled runs, clear it completely from cron"), + help=("Stop all ArchiveBox scheduled runs (remove cron jobs)"), ) group.add_argument( '--show', # '-s' @@ -67,13 +50,14 @@ def main(args=None): group.add_argument( '--foreground', '-f', action='store_true', - help=("Launch ArchiveBox as a long-running foreground task " + help=("Launch ArchiveBox scheduler as a long-running foreground task " "instead of using cron."), ) group.add_argument( '--run-all', # '-a', action='store_true', - 
help='Run all the scheduled jobs once immediately, independent of their configured schedules', + help=("Run all the scheduled jobs once immediately, independent of " + "their configured schedules, can be used together with --foreground"), ) parser.add_argument( 'import_path', @@ -83,115 +67,21 @@ def main(args=None): help=("Check this path and import any new links on every run " "(can be either local file or remote URL)"), ) - command = parser.parse_args(args) - reject_stdin(__command__) + command = parser.parse_args(args or ()) + reject_stdin(__command__, stdin) - os.makedirs(LOGS_DIR, exist_ok=True) - - cron = CronTab(user=True) - cron = dedupe_jobs(cron) - - existing_jobs = list(cron.find_comment(CRON_COMMENT)) - if command.foreground or command.run_all: - if command.import_path or (not existing_jobs): - stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI)) - stderr(' archivebox schedule --every=hour https://example.com/some/rss/feed.xml') - raise SystemExit(1) - print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI)) - if command.run_all: - try: - for job in existing_jobs: - sys.stdout.write(f' > {job.command}') - sys.stdout.flush() - job.run() - sys.stdout.write(f'\r √ {job.command}\n') - except KeyboardInterrupt: - print('\n{green}[√] Stopped.{reset}'.format(**ANSI)) - raise SystemExit(1) - if command.foreground: - try: - for result in cron.run_scheduler(): - print(result) - except KeyboardInterrupt: - print('\n{green}[√] Stopped.{reset}'.format(**ANSI)) - raise SystemExit(1) - - elif command.show: - if existing_jobs: - print('\n'.join(str(cmd) for cmd in existing_jobs)) - else: - stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI)) - stderr(' To schedule a new job, run:') - stderr(' archivebox schedule --every=[timeperiod] https://example.com/some/rss/feed.xml') - raise SystemExit(0) - - elif command.clear: - print(cron.remove_all(comment=CRON_COMMENT)) - cron.write() - raise SystemExit(0) - - elif command.every: - quoted = lambda s: f'"{s}"' if s and ' ' in s else s - cmd = [ - 'cd', - quoted(OUTPUT_DIR), - '&&', - quoted(ARCHIVEBOX_BINARY), - *(('add', f'"{command.import_path}"',) if command.import_path else ('update',)), - '2>&1', - '>', - quoted(os.path.join(LOGS_DIR, 'archivebox.log')), - - ] - new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT) - - if command.every in ('minute', 'hour', 'day', 'week', 'month', 'year'): - set_every = getattr(new_job.every(), command.every) - set_every() - elif CronSlices.is_valid(command.every): - new_job.setall(command.every) - else: - stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI)) - stderr(' It must be one of minute/hour/day/week/month') - stderr(' or a quoted cron-format schedule like:') - stderr(' archivebox init --every=day https://example.com/some/rss/feed.xml') - stderr(' archivebox init --every="0/5 * * * *" https://example.com/some/rss/feed.xml') - raise SystemExit(1) - - cron = dedupe_jobs(cron) - cron.write() - - total_runs = sum(j.frequency_per_year() for j in cron) - existing_jobs = list(cron.find_comment(CRON_COMMENT)) - - print() - print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI)) - print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs)) - if total_runs > 60 and not command.quiet: - 
stderr() - stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI)) - stderr(f' Congrats on being an enthusiastic internet archiver! πŸ‘Œ') - stderr() - stderr(' Make sure you have enough storage space available to hold all the data.') - stderr(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.') - raise SystemExit(0) - - -def dedupe_jobs(cron: CronTab) -> CronTab: - deduped = set() - for job in list(cron): - unique_tuple = (str(job.slices), job.command) - if unique_tuple not in deduped: - deduped.add(unique_tuple) - cron.remove(job) - - for schedule, command in deduped: - job = cron.new(command=command, comment=CRON_COMMENT) - job.setall(schedule) - job.enable() - - return cron + schedule( + add=command.add, + show=command.show, + clear=command.clear, + foreground=command.foreground, + run_all=command.run_all, + quiet=command.quiet, + every=command.every, + import_path=command.import_path, + out_dir=pwd or OUTPUT_DIR, + ) if __name__ == '__main__': - main() + main(args=sys.argv[1:], stdin=sys.stdin) diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py index 3fdaff5c..a5cf4b2c 100644 --- a/archivebox/cli/archivebox_server.py +++ b/archivebox/cli/archivebox_server.py @@ -7,15 +7,14 @@ __description__ = 'Run the ArchiveBox HTTP server' import sys import argparse -from ..legacy.config import setup_django, IS_TTY, OUTPUT_DIR, ANSI, check_data_folder -from ..legacy.util import reject_stdin +from typing import Optional, List, IO + +from ..main import server +from ..util import reject_stdin +from ..config import OUTPUT_DIR -def main(args=None): - check_data_folder() - - args = sys.argv[1:] if args is None else args - +def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: parser = argparse.ArgumentParser( prog=__command__, description=__description__, @@ -33,26 +32,15 @@ def main(args=None): action='store_true', help='Enable auto-reloading when code or templates change', ) - command = parser.parse_args(args) - reject_stdin(__command__) + command = parser.parse_args(args or ()) + reject_stdin(__command__, stdin) - setup_django(OUTPUT_DIR) - from django.core.management import call_command - from django.contrib.auth.models import User - - if IS_TTY and not User.objects.filter(is_superuser=True).exists(): - print('{lightyellow}[!] 
No admin users exist yet, you will not be able to edit links in the UI.{reset}'.format(**ANSI)) - print() - print(' To create an admin user, run:') - print(' archivebox manage createsuperuser') - print() - - print('{green}[+] Starting ArchiveBox webserver...{reset}'.format(**ANSI)) - if not command.reload: - command.runserver_args.append('--noreload') - - call_command("runserver", *command.runserver_args) + server( + runserver_args=command.runserver_args, + reload=command.reload, + out_dir=pwd or OUTPUT_DIR, + ) if __name__ == '__main__': - main() + main(args=sys.argv[1:], stdin=sys.stdin) diff --git a/archivebox/cli/archivebox_shell.py b/archivebox/cli/archivebox_shell.py index dd509e3f..04939328 100644 --- a/archivebox/cli/archivebox_shell.py +++ b/archivebox/cli/archivebox_shell.py @@ -7,27 +7,26 @@ __description__ = 'Enter an interactive ArchiveBox Django shell' import sys import argparse -from ..legacy.config import setup_django, OUTPUT_DIR, check_data_folder -from ..legacy.util import reject_stdin +from typing import Optional, List, IO + +from ..main import shell +from ..config import OUTPUT_DIR +from ..util import reject_stdin -def main(args=None): - check_data_folder() - - args = sys.argv[1:] if args is None else args - +def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: parser = argparse.ArgumentParser( prog=__command__, description=__description__, add_help=True, ) - parser.parse_args(args) - reject_stdin(__command__) + parser.parse_args(args or ()) + reject_stdin(__command__, stdin) + + shell( + out_dir=pwd or OUTPUT_DIR, + ) - setup_django(OUTPUT_DIR) - from django.core.management import call_command - call_command("shell_plus") - if __name__ == '__main__': - main() + main(args=sys.argv[1:], stdin=sys.stdin) diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py index e40b5b21..936e45ec 100644 --- a/archivebox/cli/archivebox_update.py +++ b/archivebox/cli/archivebox_update.py @@ -2,27 +2,36 @@ __package__ = 'archivebox.cli' __command__ = 'archivebox update' -__description__ = 'Import any new links from subscriptions and retry any previously failed/skipped links.' 
+__description__ = 'Import any new links from subscriptions and retry any previously failed/skipped links' import sys import argparse -from typing import List +from typing import List, Optional, IO -from ..legacy.config import check_data_folder -from ..legacy.util import reject_stdin -from ..legacy.main import update_archive_data +from ..main import update +from ..util import SmartFormatter, accept_stdin +from ..config import OUTPUT_DIR +from ..index import ( + get_indexed_folders, + get_archived_folders, + get_unarchived_folders, + get_present_folders, + get_valid_folders, + get_invalid_folders, + get_duplicate_folders, + get_orphaned_folders, + get_corrupted_folders, + get_unrecognized_folders, +) -def main(args: List[str]=None): - check_data_folder() - - args = sys.argv[1:] if args is None else args - +def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: parser = argparse.ArgumentParser( prog=__command__, description=__description__, add_help=True, + formatter_class=SmartFormatter, ) parser.add_argument( '--only-new', #'-n', @@ -40,16 +49,75 @@ def main(args: List[str]=None): help='Resume the update process from a given timestamp', default=None, ) + parser.add_argument( + '--overwrite', #'-x', + action='store_true', + help='Ignore existing archived content and overwrite with new versions (DANGEROUS)', + ) + parser.add_argument( + '--before', #'-b', + type=float, + help="Update only links bookmarked before the given timestamp.", + default=None, + ) + parser.add_argument( + '--after', #'-a', + type=float, + help="Update only links bookmarked after the given timestamp.", + default=None, + ) + parser.add_argument( + '--status', + type=str, + choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'), + default='indexed', + help=( + 'Update only links or data directories that have the given status\n' + f' indexed {get_indexed_folders.__doc__} (the default)\n' + f' archived {get_archived_folders.__doc__}\n' + f' unarchived {get_unarchived_folders.__doc__}\n' + '\n' + f' present {get_present_folders.__doc__}\n' + f' valid {get_valid_folders.__doc__}\n' + f' invalid {get_invalid_folders.__doc__}\n' + '\n' + f' duplicate {get_duplicate_folders.__doc__}\n' + f' orphaned {get_orphaned_folders.__doc__}\n' + f' corrupted {get_corrupted_folders.__doc__}\n' + f' unrecognized {get_unrecognized_folders.__doc__}\n' + ) + ) + parser.add_argument( + '--filter-type', + type=str, + choices=('exact', 'substring', 'domain', 'regex'), + default='exact', + help='Type of pattern matching to use when filtering URLs', + ) + parser.add_argument( + 'filter_patterns', + nargs='*', + type=str, + default=None, + help='List only URLs matching these filter patterns.' 
+ ) command = parser.parse_args(args) - reject_stdin(__command__) + filter_patterns_str = accept_stdin(stdin) - update_archive_data( - import_path=None, + update( resume=command.resume, only_new=command.only_new, index_only=command.index_only, + overwrite=command.overwrite, + filter_patterns_str=filter_patterns_str, + filter_patterns=command.filter_patterns, + filter_type=command.filter_type, + status=command.status, + after=command.after, + before=command.before, + out_dir=pwd or OUTPUT_DIR, ) if __name__ == '__main__': - main() + main(args=sys.argv[1:], stdin=sys.stdin) diff --git a/archivebox/cli/archivebox_version.py b/archivebox/cli/archivebox_version.py index 53fb4953..d3707161 100755 --- a/archivebox/cli/archivebox_version.py +++ b/archivebox/cli/archivebox_version.py @@ -4,26 +4,17 @@ __package__ = 'archivebox.cli' __command__ = 'archivebox version' __description__ = 'Print the ArchiveBox version and dependency information' -import os -import re import sys import argparse -from ..legacy.util import reject_stdin, human_readable_size -from ..legacy.config import ( - ANSI, - VERSION, - CODE_LOCATIONS, - CONFIG_LOCATIONS, - DATA_LOCATIONS, - DEPENDENCIES, - check_dependencies, -) +from typing import Optional, List, IO + +from ..main import version +from ..util import reject_stdin +from ..config import OUTPUT_DIR -def main(args=None): - args = sys.argv[1:] if args is None else args - +def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: parser = argparse.ArgumentParser( prog=__command__, description=__description__, @@ -34,92 +25,14 @@ def main(args=None): action='store_true', help='Only print ArchiveBox version number and nothing else.', ) - command = parser.parse_args(args) - reject_stdin(__command__) + command = parser.parse_args(args or ()) + reject_stdin(__command__, stdin) - if command.quiet: - print(VERSION) - else: - print('ArchiveBox v{}'.format(VERSION)) - print() - - print('{white}[i] Dependency versions:{reset}'.format(**ANSI)) - for name, dependency in DEPENDENCIES.items(): - print_dependency_version(name, dependency) - - print() - print('{white}[i] Code locations:{reset}'.format(**ANSI)) - for name, folder in CODE_LOCATIONS.items(): - print_folder_status(name, folder) - - print() - print('{white}[i] Config locations:{reset}'.format(**ANSI)) - for name, folder in CONFIG_LOCATIONS.items(): - print_folder_status(name, folder) - - print() - print('{white}[i] Data locations:{reset}'.format(**ANSI)) - for name, folder in DATA_LOCATIONS.items(): - print_folder_status(name, folder) - - print() - check_dependencies() - - -def print_folder_status(name, folder): - if folder['enabled']: - if folder['is_valid']: - color, symbol, note = 'green', '√', 'valid' - else: - color, symbol, note, num_files = 'red', 'X', 'invalid', '?' 
- else: - color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-' - - if folder['path']: - if os.path.exists(folder['path']): - num_files = ( - f'{len(os.listdir(folder["path"]))} files' - if os.path.isdir(folder['path']) else - human_readable_size(os.path.getsize(folder['path'])) - ) - else: - num_files = 'missing' - - print( - ANSI[color], - symbol, - ANSI['reset'], - name.ljust(24), - (folder["path"] or '').ljust(70), - num_files.ljust(14), - ANSI[color], - note, - ANSI['reset'], - ) - - -def print_dependency_version(name, dependency): - if dependency['enabled']: - if dependency['is_valid']: - color, symbol, note = 'green', '√', 'valid' - version = 'v' + re.search(r'[\d\.]+', dependency['version'])[0] - else: - color, symbol, note, version = 'red', 'X', 'invalid', '?' - else: - color, symbol, note, version = 'lightyellow', '-', 'disabled', '-' - - print( - ANSI[color], - symbol, - ANSI['reset'], - name.ljust(24), - (dependency["path"] or '').ljust(70), - version.ljust(14), - ANSI[color], - note, - ANSI['reset'], + version( + quiet=command.quiet, + out_dir=pwd or OUTPUT_DIR, ) if __name__ == '__main__': - main() + main(args=sys.argv[1:], stdin=sys.stdin) diff --git a/archivebox/legacy/logs.py b/archivebox/cli/logging.py similarity index 94% rename from archivebox/legacy/logs.py rename to archivebox/cli/logging.py index e0b34301..87a7fab1 100644 --- a/archivebox/legacy/logs.py +++ b/archivebox/cli/logging.py @@ -1,3 +1,5 @@ +__package__ = 'archivebox.cli' + import os import sys @@ -5,8 +7,8 @@ from datetime import datetime from dataclasses import dataclass from typing import Optional, List -from .schema import Link, ArchiveResult -from .config import ANSI, OUTPUT_DIR, IS_TTY +from ..index.schema import Link, ArchiveResult +from ..config import ANSI, OUTPUT_DIR, IS_TTY @dataclass @@ -80,7 +82,7 @@ def log_indexing_finished(out_path: str): ### Archiving Stage -def log_archiving_started(num_links: int, resume: Optional[float]): +def log_archiving_started(num_links: int, resume: Optional[float]=None): start_ts = datetime.now() _LAST_RUN_STATS.archiving_start_ts = start_ts print() @@ -92,7 +94,7 @@ def log_archiving_started(num_links: int, resume: Optional[float]): **ANSI, )) else: - print('{green}[β–Ά] [{}] Updating content for {} pages in archive...{reset}'.format( + print('{green}[β–Ά] [{}] Updating content for {} matching pages in archive...{reset}'.format( start_ts.strftime('%Y-%m-%d %H:%M:%S'), num_links, **ANSI, @@ -213,18 +215,18 @@ def log_archive_method_finished(result: ArchiveResult): print() -def log_list_started(filter_patterns: List[str], filter_type: str): +def log_list_started(filter_patterns: Optional[List[str]], filter_type: str): print('{green}[*] Finding links in the archive index matching these {} patterns:{reset}'.format( filter_type, **ANSI, )) - print(' {}'.format(' '.join(filter_patterns))) + print(' {}'.format(' '.join(filter_patterns or ()))) def log_list_finished(links): - from .util import to_csv + from ..util import links_to_csv print() print('---------------------------------------------------------------------------------------------------') - print(to_csv(links, csv_cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | ')) + print(links_to_csv(links, csv_cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | ')) print('---------------------------------------------------------------------------------------------------') print() diff --git a/archivebox/tests.py 
b/archivebox/cli/tests.py similarity index 97% rename from archivebox/tests.py rename to archivebox/cli/tests.py index 921fa1e7..14d0e4c6 100755 --- a/archivebox/tests.py +++ b/archivebox/cli/tests.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -__package__ = 'archivebox' +__package__ = 'archivebox.cli' import os @@ -29,15 +29,15 @@ TEST_CONFIG = { OUTPUT_DIR = 'data.tests' os.environ.update(TEST_CONFIG) -from .legacy.main import init -from .legacy.index import load_main_index -from .legacy.config import ( +from ..main import init +from ..index import load_main_index +from ..config import ( SQL_INDEX_FILENAME, JSON_INDEX_FILENAME, HTML_INDEX_FILENAME, ) -from .cli import ( +from . import ( archivebox_init, archivebox_add, archivebox_remove, diff --git a/archivebox/legacy/config.py b/archivebox/config/__init__.py similarity index 96% rename from archivebox/legacy/config.py rename to archivebox/config/__init__.py index a38451d1..eb62d3d8 100644 --- a/archivebox/legacy/config.py +++ b/archivebox/config/__init__.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox.legacy' +__package__ = 'archivebox.config' import os import io @@ -13,7 +13,7 @@ from typing import Optional, Type, Tuple, Dict from subprocess import run, PIPE, DEVNULL from configparser import ConfigParser -from .config_stubs import ( +from .stubs import ( SimpleConfigValueDict, ConfigValue, ConfigDict, @@ -40,7 +40,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'GENERAL_CONFIG': { 'OUTPUT_DIR': {'type': str, 'default': None}, 'CONFIG_FILE': {'type': str, 'default': None}, - 'ONLY_NEW': {'type': bool, 'default': False}, + 'ONLY_NEW': {'type': bool, 'default': True}, 'TIMEOUT': {'type': int, 'default': 60}, 'MEDIA_TIMEOUT': {'type': int, 'default': 3600}, 'OUTPUT_PERMISSIONS': {'type': str, 'default': '755'}, @@ -122,8 +122,7 @@ ANSI = {k: '' for k in DEFAULT_CLI_COLORS.keys()} VERSION_FILENAME = 'VERSION' PYTHON_DIR_NAME = 'archivebox' -LEGACY_DIR_NAME = 'legacy' -TEMPLATES_DIR_NAME = 'templates' +TEMPLATES_DIR_NAME = 'themes' ARCHIVE_DIR_NAME = 'archive' SOURCES_DIR_NAME = 'sources' @@ -158,8 +157,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'REPO_DIR': {'default': lambda c: os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..'))}, 'PYTHON_DIR': {'default': lambda c: os.path.join(c['REPO_DIR'], PYTHON_DIR_NAME)}, - 'LEGACY_DIR': {'default': lambda c: os.path.join(c['PYTHON_DIR'], LEGACY_DIR_NAME)}, - 'TEMPLATES_DIR': {'default': lambda c: os.path.join(c['LEGACY_DIR'], TEMPLATES_DIR_NAME)}, + 'TEMPLATES_DIR': {'default': lambda c: os.path.join(c['PYTHON_DIR'], TEMPLATES_DIR_NAME, 'legacy')}, 'OUTPUT_DIR': {'default': lambda c: os.path.abspath(os.path.expanduser(c['OUTPUT_DIR'])) if c['OUTPUT_DIR'] else os.path.abspath(os.curdir)}, 'ARCHIVE_DIR': {'default': lambda c: os.path.join(c['OUTPUT_DIR'], ARCHIVE_DIR_NAME)}, @@ -210,7 +208,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)}, 'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)}, - 'CONFIG_LOCATIONS': {'default': lambda c: get_config_locations(c)}, + 'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)}, 'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)}, 'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)}, } @@ -370,6 +368,7 @@ def load_config(defaults: ConfigDefaultDict, stderr(' For config documentation and examples see:') stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration') stderr() + raise raise SystemExit(2) return 
extended_config @@ -492,18 +491,13 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict: 'REPO_DIR': { 'path': os.path.abspath(config['REPO_DIR']), 'enabled': True, - 'is_valid': os.path.exists(os.path.join(config['REPO_DIR'], '.github')), + 'is_valid': os.path.exists(os.path.join(config['REPO_DIR'], 'archivebox')), }, 'PYTHON_DIR': { 'path': os.path.abspath(config['PYTHON_DIR']), 'enabled': True, 'is_valid': os.path.exists(os.path.join(config['PYTHON_DIR'], '__main__.py')), }, - 'LEGACY_DIR': { - 'path': os.path.abspath(config['LEGACY_DIR']), - 'enabled': True, - 'is_valid': os.path.exists(os.path.join(config['LEGACY_DIR'], 'util.py')), - }, 'TEMPLATES_DIR': { 'path': os.path.abspath(config['TEMPLATES_DIR']), 'enabled': True, @@ -511,14 +505,9 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict: }, } -def get_config_locations(config: ConfigDict) -> ConfigValue: +def get_external_locations(config: ConfigDict) -> ConfigValue: abspath = lambda path: None if path is None else os.path.abspath(path) return { - 'CONFIG_FILE': { - 'path': abspath(config['CHROME_USER_DATA_DIR']), - 'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'], - 'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else os.path.exists(os.path.join(config['CHROME_USER_DATA_DIR'], 'Default')), - }, 'CHROME_USER_DATA_DIR': { 'path': abspath(config['CHROME_USER_DATA_DIR']), 'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'], @@ -553,11 +542,26 @@ def get_data_locations(config: ConfigDict) -> ConfigValue: 'enabled': True, 'is_valid': os.path.exists(config['ARCHIVE_DIR']), }, + 'CONFIG_FILE': { + 'path': os.path.abspath(config['CONFIG_FILE']), + 'enabled': True, + 'is_valid': os.path.exists(config['CONFIG_FILE']), + }, 'SQL_INDEX': { + 'path': os.path.abspath(os.path.join(config['OUTPUT_DIR'], SQL_INDEX_FILENAME)), + 'enabled': True, + 'is_valid': os.path.exists(os.path.join(config['OUTPUT_DIR'], SQL_INDEX_FILENAME)), + }, + 'JSON_INDEX': { 'path': os.path.abspath(os.path.join(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)), 'enabled': True, 'is_valid': os.path.exists(os.path.join(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)), }, + 'HTML_INDEX': { + 'path': os.path.abspath(os.path.join(config['OUTPUT_DIR'], HTML_INDEX_FILENAME)), + 'enabled': True, + 'is_valid': os.path.exists(os.path.join(config['OUTPUT_DIR'], HTML_INDEX_FILENAME)), + }, } def get_dependency_info(config: ConfigDict) -> ConfigValue: @@ -731,7 +735,7 @@ def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> json_index_exists = os.path.exists(os.path.join(output_dir, JSON_INDEX_FILENAME)) if not json_index_exists: - stderr('[X] No archive index was found in current directory.', color='red') + stderr('[X] No archive main index was found in current directory.', color='red') stderr(f' {output_dir}') stderr() stderr(' Are you running archivebox in the right folder?') @@ -743,7 +747,7 @@ def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> raise SystemExit(2) sql_index_exists = os.path.exists(os.path.join(output_dir, SQL_INDEX_FILENAME)) - from .storage.sql import list_migrations + from ..index.sql import list_migrations pending_migrations = [name for status, name in list_migrations() if not status] diff --git a/archivebox/legacy/config_stubs.py b/archivebox/config/stubs.py similarity index 99% rename from archivebox/legacy/config_stubs.py rename to archivebox/config/stubs.py index b741bc3a..f7d5059a 100644 --- a/archivebox/legacy/config_stubs.py +++ 
b/archivebox/config/stubs.py @@ -17,6 +17,7 @@ class ConfigDict(BaseConfig, total=False): SHOW_PROGRESS: bool OUTPUT_DIR: str + CONFIG_FILE: str ONLY_NEW: bool TIMEOUT: int MEDIA_TIMEOUT: int @@ -63,7 +64,6 @@ class ConfigDict(BaseConfig, total=False): ANSI: Dict[str, str] REPO_DIR: str PYTHON_DIR: str - LEGACY_DIR: str TEMPLATES_DIR: str ARCHIVE_DIR: str SOURCES_DIR: str diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index d131d3e8..23fe3286 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -1,9 +1,7 @@ - -from datetime import datetime - from django.contrib import admin -from .models import Page +from core.models import Page + class PageAdmin(admin.ModelAdmin): list_display = ('timestamp', 'short_url', 'title', 'is_archived', 'num_outputs', 'added', 'updated', 'url_hash') diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 9c82c61d..a41f3d1c 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -4,8 +4,8 @@ import uuid from django.db import models -from legacy.schema import Link -from legacy.util import parse_date +from ..util import parse_date +from ..index.schema import Link class Page(models.Model): diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 90d54080..7411ab9c 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -2,8 +2,8 @@ from django.shortcuts import render from django.views import View -from legacy.config import OUTPUT_DIR -from legacy.index import load_main_index, load_main_index_meta +from .index import load_main_index, load_main_index_meta +from .config import OUTPUT_DIR class MainIndex(View): @@ -34,7 +34,7 @@ class AddLinks(View): def post(self, request): import_path = request.POST['url'] - # TODO: add the links to the index here using archivebox.legacy.main.update_archive_data + # TODO: add the links to the index here using archivebox.main.add print(f'Adding URL: {import_path}') return render(template_name=self.template, request=request, context={}) diff --git a/archivebox/core/welcome_message.py b/archivebox/core/welcome_message.py index b3a9ebf8..70410c75 100644 --- a/archivebox/core/welcome_message.py +++ b/archivebox/core/welcome_message.py @@ -1,4 +1,17 @@ -print() -print('[i] Welcome to the ArchiveBox Shell! Example usage:') -print(' Page.objects.all()') -print(' User.objects.all()') +from cli import list_subcommands + +from .config import ANSI + + +if __name__ == '__main__': + print('{green}# ArchiveBox Imports{reset}'.format(**ANSI)) + # print('from archivebox.core.models import Page, User') + print('{green}from archivebox.cli import\narchivebox_{}{reset}'.format("\narchivebox_".join(list_subcommands().keys()), **ANSI)) + print() + print('[i] Welcome to the ArchiveBox Shell! 
Example use:') + print(' print(Page.objects.filter(is_archived=True).count())') + print(' Page.objects.get(url="https://example.com").as_json()') + + print(' Page.objects.get(url="https://example.com").as_json()') + + print(' from archivebox.main import get_invalid_folders') diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py new file mode 100644 index 00000000..a8f28ce1 --- /dev/null +++ b/archivebox/extractors/__init__.py @@ -0,0 +1,105 @@ +__package__ = 'archivebox.extractors' + +import os + +from typing import Optional +from datetime import datetime + +from ..index.schema import Link +from ..index import ( + load_link_details, + write_link_details, + patch_main_index, +) +from ..util import enforce_types +from ..cli.logging import ( + log_link_archiving_started, + log_link_archiving_finished, + log_archive_method_started, + log_archive_method_finished, +) + +from .title import should_save_title, save_title +from .favicon import should_save_favicon, save_favicon +from .wget import should_save_wget, save_wget +from .pdf import should_save_pdf, save_pdf +from .screenshot import should_save_screenshot, save_screenshot +from .dom import should_save_dom, save_dom +from .git import should_save_git, save_git +from .media import should_save_media, save_media +from .archive_org import should_save_archive_dot_org, save_archive_dot_org + + +@enforce_types +def archive_link(link: Link, overwrite: bool=False, out_dir: Optional[str]=None) -> Link: + """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp""" + + ARCHIVE_METHODS = ( + ('title', should_save_title, save_title), + ('favicon', should_save_favicon, save_favicon), + ('wget', should_save_wget, save_wget), + ('pdf', should_save_pdf, save_pdf), + ('screenshot', should_save_screenshot, save_screenshot), + ('dom', should_save_dom, save_dom), + ('git', should_save_git, save_git), + ('media', should_save_media, save_media), + ('archive_org', should_save_archive_dot_org, save_archive_dot_org), + ) + + out_dir = out_dir or link.link_dir + try: + is_new = not os.path.exists(out_dir) + if is_new: + os.makedirs(out_dir) + + link = load_link_details(link, out_dir=out_dir) + log_link_archiving_started(link, out_dir, is_new) + link = link.overwrite(updated=datetime.now()) + stats = {'skipped': 0, 'succeeded': 0, 'failed': 0} + + for method_name, should_run, method_function in ARCHIVE_METHODS: + try: + if method_name not in link.history: + link.history[method_name] = [] + + if should_run(link, out_dir) or overwrite: + log_archive_method_started(method_name) + + result = method_function(link=link, out_dir=out_dir) + + link.history[method_name].append(result) + + stats[result.status] += 1 + log_archive_method_finished(result) + else: + stats['skipped'] += 1 + except Exception as e: + raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format( + method_name, + link.url, + )) from e + + # print(' ', stats) + + write_link_details(link, out_dir=link.link_dir) + patch_main_index(link) + + # # If any changes were made, update the main links index json and html + # was_changed = stats['succeeded'] or stats['failed'] + # if was_changed: + # patch_main_index(link) + + log_link_archiving_finished(link, link.link_dir, is_new, stats) + + except KeyboardInterrupt: + try: + write_link_details(link, out_dir=link.link_dir) + except: + pass + raise + + except Exception as err: + print(' ! 
Failed to archive link: {}: {}'.format(err.__class__.__name__, err)) + raise + + return link diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py new file mode 100644 index 00000000..ad6d409b --- /dev/null +++ b/archivebox/extractors/archive_org.py @@ -0,0 +1,115 @@ +__package__ = 'archivebox.extractors' + +import os + +from typing import Optional, List, Dict, Tuple +from collections import defaultdict + +from ..index.schema import Link, ArchiveResult, ArchiveOutput +from ..util import ( + enforce_types, + TimedProgress, + run, + PIPE, + DEVNULL, + is_static_file, + ArchiveError, + chmod_file, +) +from ..config import ( + VERSION, + TIMEOUT, + SAVE_ARCHIVE_DOT_ORG, + CURL_BINARY, + CURL_VERSION, + CHECK_SSL_VALIDITY +) + + + +@enforce_types +def should_save_archive_dot_org(link: Link, out_dir: Optional[str]=None) -> bool: + out_dir = out_dir or link.link_dir + if is_static_file(link.url): + return False + + if os.path.exists(os.path.join(out_dir, 'archive.org.txt')): + # if open(path, 'r').read().strip() != 'None': + return False + + return SAVE_ARCHIVE_DOT_ORG + +@enforce_types +def save_archive_dot_org(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: + """submit site to archive.org for archiving via their service, save returned archive url""" + + out_dir = out_dir or link.link_dir + output: ArchiveOutput = 'archive.org.txt' + archive_org_url = None + submit_url = 'https://web.archive.org/save/{}'.format(link.url) + cmd = [ + CURL_BINARY, + '--location', + '--head', + '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from + '--max-time', str(timeout), + *([] if CHECK_SSL_VALIDITY else ['--insecure']), + submit_url, + ] + status = 'succeeded' + timer = TimedProgress(timeout, prefix=' ') + try: + result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout) + content_location, errors = parse_archive_dot_org_response(result.stdout) + if content_location: + archive_org_url = 'https://web.archive.org{}'.format(content_location[0]) + elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]: + archive_org_url = None + # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url))) + elif errors: + raise ArchiveError(', '.join(errors)) + else: + raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.') + except Exception as err: + status = 'failed' + output = err + finally: + timer.end() + + if output and not isinstance(output, Exception): + # instead of writing None when archive.org rejects the url write the + # url to resubmit it to archive.org. This is so when the user visits + # the URL in person, it will attempt to re-archive it, and it'll show the + # nicer error message explaining why the url was rejected if it fails. 
+ archive_org_url = archive_org_url or submit_url + with open(os.path.join(out_dir, str(output)), 'w', encoding='utf-8') as f: + f.write(archive_org_url) + chmod_file('archive.org.txt', cwd=out_dir) + output = archive_org_url + + return ArchiveResult( + cmd=cmd, + pwd=out_dir, + cmd_version=CURL_VERSION, + output=output, + status=status, + **timer.stats, + ) + +@enforce_types +def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]: + # Parse archive.org response headers + headers: Dict[str, List[str]] = defaultdict(list) + + # lowercase all the header names and store in dict + for header in response.splitlines(): + if b':' not in header or not header.strip(): + continue + name, val = header.decode().split(':', 1) + headers[name.lower().strip()].append(val.strip()) + + # Get successful archive url in "content-location" header or any errors + content_location = headers['content-location'] + errors = headers['x-archive-wayback-runtime-error'] + return content_location, errors + diff --git a/archivebox/extractors/dom.py b/archivebox/extractors/dom.py new file mode 100644 index 00000000..a002302f --- /dev/null +++ b/archivebox/extractors/dom.py @@ -0,0 +1,73 @@ +__package__ = 'archivebox.extractors' + +import os + +from typing import Optional + +from ..index.schema import Link, ArchiveResult, ArchiveOutput +from ..util import ( + enforce_types, + TimedProgress, + run, + PIPE, + is_static_file, + ArchiveError, + chrome_args, + chmod_file, +) +from ..config import ( + TIMEOUT, + SAVE_DOM, + CHROME_VERSION, +) + + + +@enforce_types +def should_save_dom(link: Link, out_dir: Optional[str]=None) -> bool: + out_dir = out_dir or link.link_dir + if is_static_file(link.url): + return False + + if os.path.exists(os.path.join(out_dir, 'output.html')): + return False + + return SAVE_DOM + +@enforce_types +def save_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: + """print HTML of site to file using chrome --dump-html""" + + out_dir = out_dir or link.link_dir + output: ArchiveOutput = 'output.html' + output_path = os.path.join(out_dir, str(output)) + cmd = [ + *chrome_args(TIMEOUT=timeout), + '--dump-dom', + link.url + ] + status = 'succeeded' + timer = TimedProgress(timeout, prefix=' ') + try: + with open(output_path, 'w+') as f: + result = run(cmd, stdout=f, stderr=PIPE, cwd=out_dir, timeout=timeout) + + if result.returncode: + hints = result.stderr.decode() + raise ArchiveError('Failed to save DOM', hints) + + chmod_file(output, cwd=out_dir) + except Exception as err: + status = 'failed' + output = err + finally: + timer.end() + + return ArchiveResult( + cmd=cmd, + pwd=out_dir, + cmd_version=CHROME_VERSION, + output=output, + status=status, + **timer.stats, + ) diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py new file mode 100644 index 00000000..0dff3900 --- /dev/null +++ b/archivebox/extractors/favicon.py @@ -0,0 +1,65 @@ +__package__ = 'archivebox.extractors' + +import os + +from typing import Optional + +from ..index.schema import Link, ArchiveResult, ArchiveOutput +from ..util import ( + enforce_types, + TimedProgress, + domain, + run, + PIPE, + chmod_file, +) +from ..config import ( + TIMEOUT, + SAVE_FAVICON, + CURL_BINARY, + CURL_VERSION, + CHECK_SSL_VALIDITY, +) + + +@enforce_types +def should_save_favicon(link: Link, out_dir: Optional[str]=None) -> bool: + out_dir = out_dir or link.link_dir + if os.path.exists(os.path.join(out_dir, 'favicon.ico')): + return False + + return SAVE_FAVICON + 
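# Aside on the new extractor modules above: each one exposes a should_save_*()
# predicate and a save_*() function returning an ArchiveResult, and archive_link()
# in extractors/__init__.py simply walks the (name, should_save, save) tuples in
# ARCHIVE_METHODS. As a minimal sketch (assuming archivebox is importable; the
# response bytes and snapshot path below are invented purely for illustration),
# the standalone header parser from archive_org.py can be exercised on its own:

from archivebox.extractors.archive_org import parse_archive_dot_org_response

sample_response = (
    b'HTTP/2 302\r\n'
    b'Content-Location: /web/20190401000000/https://example.com/\r\n'
    b'Server: nginx\r\n'
)

content_location, errors = parse_archive_dot_org_response(sample_response)
# content_location -> ['/web/20190401000000/https://example.com/']
# errors           -> []   (no x-archive-wayback-runtime-error header in this sample)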
+@enforce_types +def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: + """download site favicon from google's favicon api""" + + out_dir = out_dir or link.link_dir + output: ArchiveOutput = 'favicon.ico' + cmd = [ + CURL_BINARY, + '--max-time', str(timeout), + '--location', + '--output', str(output), + *([] if CHECK_SSL_VALIDITY else ['--insecure']), + 'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)), + ] + status = 'succeeded' + timer = TimedProgress(timeout, prefix=' ') + try: + run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout) + chmod_file(output, cwd=out_dir) + except Exception as err: + status = 'failed' + output = err + finally: + timer.end() + + return ArchiveResult( + cmd=cmd, + pwd=out_dir, + cmd_version=CURL_VERSION, + output=output, + status=status, + **timer.stats, + ) diff --git a/archivebox/extractors/git.py b/archivebox/extractors/git.py new file mode 100644 index 00000000..21a86f5e --- /dev/null +++ b/archivebox/extractors/git.py @@ -0,0 +1,94 @@ +__package__ = 'archivebox.extractors' + +import os + +from typing import Optional + +from ..index.schema import Link, ArchiveResult, ArchiveOutput +from ..util import ( + enforce_types, + TimedProgress, + run, + PIPE, + is_static_file, + ArchiveError, + chmod_file, + domain, + extension, + without_query, + without_fragment, +) +from ..config import ( + TIMEOUT, + SAVE_GIT, + GIT_BINARY, + GIT_VERSION, + GIT_DOMAINS, + CHECK_SSL_VALIDITY +) + + + +@enforce_types +def should_save_git(link: Link, out_dir: Optional[str]=None) -> bool: + out_dir = out_dir or link.link_dir + if is_static_file(link.url): + return False + + if os.path.exists(os.path.join(out_dir, 'git')): + return False + + is_clonable_url = ( + (domain(link.url) in GIT_DOMAINS) + or (extension(link.url) == 'git') + ) + if not is_clonable_url: + return False + + return SAVE_GIT + + +@enforce_types +def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: + """download full site using git""" + + out_dir = out_dir or link.link_dir + output: ArchiveOutput = 'git' + output_path = os.path.join(out_dir, str(output)) + os.makedirs(output_path, exist_ok=True) + cmd = [ + GIT_BINARY, + 'clone', + '--mirror', + '--recursive', + *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']), + without_query(without_fragment(link.url)), + ] + status = 'succeeded' + timer = TimedProgress(timeout, prefix=' ') + try: + result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1) + + if result.returncode == 128: + # ignore failed re-download when the folder already exists + pass + elif result.returncode > 0: + hints = 'Got git response code: {}.'.format(result.returncode) + raise ArchiveError('Failed to save git clone', hints) + + chmod_file(output, cwd=out_dir) + + except Exception as err: + status = 'failed' + output = err + finally: + timer.end() + + return ArchiveResult( + cmd=cmd, + pwd=out_dir, + cmd_version=GIT_VERSION, + output=output, + status=status, + **timer.stats, + ) diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py new file mode 100644 index 00000000..9fd9a9be --- /dev/null +++ b/archivebox/extractors/media.py @@ -0,0 +1,100 @@ +__package__ = 'archivebox.extractors' + +import os + +from typing import Optional + +from ..index.schema import Link, ArchiveResult, ArchiveOutput +from ..util import ( + enforce_types, + TimedProgress, + run, + PIPE, + is_static_file, + ArchiveError, + chmod_file, +) +from 
..config import ( + MEDIA_TIMEOUT, + SAVE_MEDIA, + YOUTUBEDL_BINARY, + YOUTUBEDL_VERSION, + CHECK_SSL_VALIDITY +) + + +@enforce_types +def should_save_media(link: Link, out_dir: Optional[str]=None) -> bool: + out_dir = out_dir or link.link_dir + + if is_static_file(link.url): + return False + + if os.path.exists(os.path.join(out_dir, 'media')): + return False + + return SAVE_MEDIA + +@enforce_types +def save_media(link: Link, out_dir: Optional[str]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult: + """Download playlists or individual video, audio, and subtitles using youtube-dl""" + + out_dir = out_dir or link.link_dir + output: ArchiveOutput = 'media' + output_path = os.path.join(out_dir, str(output)) + os.makedirs(output_path, exist_ok=True) + cmd = [ + YOUTUBEDL_BINARY, + '--write-description', + '--write-info-json', + '--write-annotations', + '--yes-playlist', + '--write-thumbnail', + '--no-call-home', + '--no-check-certificate', + '--user-agent', + '--all-subs', + '--extract-audio', + '--keep-video', + '--ignore-errors', + '--geo-bypass', + '--audio-format', 'mp3', + '--audio-quality', '320K', + '--embed-thumbnail', + '--add-metadata', + *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']), + link.url, + ] + status = 'succeeded' + timer = TimedProgress(timeout, prefix=' ') + try: + result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1) + chmod_file(output, cwd=out_dir) + if result.returncode: + if (b'ERROR: Unsupported URL' in result.stderr + or b'HTTP Error 404' in result.stderr + or b'HTTP Error 403' in result.stderr + or b'URL could be a direct video link' in result.stderr + or b'Unable to extract container ID' in result.stderr): + # These happen too frequently on non-media pages to warrant printing to console + pass + else: + hints = ( + 'Got youtube-dl response code: {}.'.format(result.returncode), + *result.stderr.decode().split('\n'), + ) + raise ArchiveError('Failed to save media', hints) + except Exception as err: + status = 'failed' + output = err + finally: + timer.end() + + return ArchiveResult( + cmd=cmd, + pwd=out_dir, + cmd_version=YOUTUBEDL_VERSION, + output=output, + status=status, + **timer.stats, + ) diff --git a/archivebox/extractors/pdf.py b/archivebox/extractors/pdf.py new file mode 100644 index 00000000..e7ade948 --- /dev/null +++ b/archivebox/extractors/pdf.py @@ -0,0 +1,72 @@ +__package__ = 'archivebox.extractors' + +import os + +from typing import Optional + +from ..index.schema import Link, ArchiveResult, ArchiveOutput +from ..util import ( + enforce_types, + TimedProgress, + run, + PIPE, + is_static_file, + ArchiveError, + chrome_args, + chmod_file, +) +from ..config import ( + TIMEOUT, + SAVE_PDF, + CHROME_VERSION, +) + + + +@enforce_types +def should_save_pdf(link: Link, out_dir: Optional[str]=None) -> bool: + out_dir = out_dir or link.link_dir + if is_static_file(link.url): + return False + + if os.path.exists(os.path.join(out_dir, 'output.pdf')): + return False + + return SAVE_PDF + + +@enforce_types +def save_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: + """print PDF of site to file using chrome --headless""" + + out_dir = out_dir or link.link_dir + output: ArchiveOutput = 'output.pdf' + cmd = [ + *chrome_args(TIMEOUT=timeout), + '--print-to-pdf', + link.url, + ] + status = 'succeeded' + timer = TimedProgress(timeout, prefix=' ') + try: + result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout) + + if result.returncode: + hints = (result.stderr or 
result.stdout).decode() + raise ArchiveError('Failed to save PDF', hints) + + chmod_file('output.pdf', cwd=out_dir) + except Exception as err: + status = 'failed' + output = err + finally: + timer.end() + + return ArchiveResult( + cmd=cmd, + pwd=out_dir, + cmd_version=CHROME_VERSION, + output=output, + status=status, + **timer.stats, + ) diff --git a/archivebox/extractors/screenshot.py b/archivebox/extractors/screenshot.py new file mode 100644 index 00000000..3e211939 --- /dev/null +++ b/archivebox/extractors/screenshot.py @@ -0,0 +1,71 @@ +__package__ = 'archivebox.extractors' + +import os + +from typing import Optional + +from ..index.schema import Link, ArchiveResult, ArchiveOutput +from ..util import ( + enforce_types, + TimedProgress, + run, + PIPE, + is_static_file, + ArchiveError, + chrome_args, + chmod_file, +) +from ..config import ( + TIMEOUT, + SAVE_SCREENSHOT, + CHROME_VERSION, +) + + + +@enforce_types +def should_save_screenshot(link: Link, out_dir: Optional[str]=None) -> bool: + out_dir = out_dir or link.link_dir + if is_static_file(link.url): + return False + + if os.path.exists(os.path.join(out_dir, 'screenshot.png')): + return False + + return SAVE_SCREENSHOT + +@enforce_types +def save_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: + """take screenshot of site using chrome --headless""" + + out_dir = out_dir or link.link_dir + output: ArchiveOutput = 'screenshot.png' + cmd = [ + *chrome_args(TIMEOUT=timeout), + '--screenshot', + link.url, + ] + status = 'succeeded' + timer = TimedProgress(timeout, prefix=' ') + try: + result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout) + + if result.returncode: + hints = (result.stderr or result.stdout).decode() + raise ArchiveError('Failed to save screenshot', hints) + + chmod_file(output, cwd=out_dir) + except Exception as err: + status = 'failed' + output = err + finally: + timer.end() + + return ArchiveResult( + cmd=cmd, + pwd=out_dir, + cmd_version=CHROME_VERSION, + output=output, + status=status, + **timer.stats, + ) diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py new file mode 100644 index 00000000..c8ba9dd3 --- /dev/null +++ b/archivebox/extractors/title.py @@ -0,0 +1,63 @@ +__package__ = 'archivebox.extractors' + +from typing import Optional + +from ..index.schema import Link, ArchiveResult, ArchiveOutput +from ..util import ( + enforce_types, + TimedProgress, + is_static_file, + ArchiveError, + fetch_page_title, +) +from ..config import ( + TIMEOUT, + SAVE_TITLE, + CURL_BINARY, + CURL_VERSION, +) + + +@enforce_types +def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool: + # if link already has valid title, skip it + if link.title and not link.title.lower().startswith('http'): + return False + + if is_static_file(link.url): + return False + + return SAVE_TITLE + +@enforce_types +def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: + """try to guess the page's title from its content""" + + output: ArchiveOutput = None + cmd = [ + CURL_BINARY, + link.url, + '|', + 'grep', + ' bool: + output_path = wget_output_path(link) + out_dir = out_dir or link.link_dir + if output_path and os.path.exists(os.path.join(out_dir, output_path)): + return False + + return SAVE_WGET + + +@enforce_types +def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: + """download full site using wget""" + + out_dir = out_dir or link.link_dir + if SAVE_WARC: + 
warc_dir = os.path.join(out_dir, 'warc') + os.makedirs(warc_dir, exist_ok=True) + warc_path = os.path.join('warc', str(int(datetime.now().timestamp()))) + + # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html + output: ArchiveOutput = None + cmd = [ + WGET_BINARY, + # '--server-response', # print headers for better error parsing + '--no-verbose', + '--adjust-extension', + '--convert-links', + '--force-directories', + '--backup-converted', + '--span-hosts', + '--no-parent', + '-e', 'robots=off', + '--restrict-file-names=windows', + '--timeout={}'.format(timeout), + *([] if SAVE_WARC else ['--timestamping']), + *(['--warc-file={}'.format(warc_path)] if SAVE_WARC else []), + *(['--page-requisites'] if SAVE_WGET_REQUISITES else []), + *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []), + *(['--load-cookies', COOKIES_FILE] if COOKIES_FILE else []), + *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []), + *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']), + link.url, + ] + status = 'succeeded' + timer = TimedProgress(timeout, prefix=' ') + try: + result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout) + output = wget_output_path(link) + + # parse out number of files downloaded from last line of stderr: + # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)" + output_tail = [ + line.strip() + for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] + if line.strip() + ] + files_downloaded = ( + int(output_tail[-1].strip().split(' ', 2)[1] or 0) + if 'Downloaded:' in output_tail[-1] + else 0 + ) + + # Check for common failure cases + if result.returncode > 0 and files_downloaded < 1: + hints = ( + 'Got wget response code: {}.'.format(result.returncode), + *output_tail, + ) + if b'403: Forbidden' in result.stderr: + raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints) + if b'404: Not Found' in result.stderr: + raise ArchiveError('404 Not Found', hints) + if b'ERROR 500: Internal Server Error' in result.stderr: + raise ArchiveError('500 Internal Server Error', hints) + raise ArchiveError('Got an error from the server', hints) + + # chmod_file(output, cwd=out_dir) + except Exception as err: + status = 'failed' + output = err + finally: + timer.end() + + return ArchiveResult( + cmd=cmd, + pwd=out_dir, + cmd_version=WGET_VERSION, + output=output, + status=status, + **timer.stats, + ) diff --git a/archivebox/legacy/index.py b/archivebox/index/__init__.py similarity index 51% rename from archivebox/legacy/index.py rename to archivebox/index/__init__.py index 29b355db..d7b6b43e 100644 --- a/archivebox/legacy/index.py +++ b/archivebox/index/__init__.py @@ -1,14 +1,25 @@ -__package__ = 'archivebox.legacy' +__package__ = 'archivebox.index' +import re import os -import json +import shutil +import json as pyjson -from typing import List, Tuple, Optional, Iterable +from itertools import chain +from typing import List, Tuple, Dict, Optional, Iterable from collections import OrderedDict from contextlib import contextmanager -from .schema import Link, ArchiveResult -from .config import ( +from ..parsers import parse_links +from ..util import ( + scheme, + enforce_types, + TimedProgress, + atomic_write, + ExtendedEncoder, +) +from ..config import ( + ARCHIVE_DIR_NAME, SQL_INDEX_FILENAME, JSON_INDEX_FILENAME, HTML_INDEX_FILENAME, @@ -18,26 +29,7 @@ from .config import ( ANSI, stderr, ) -from .storage.html import write_html_main_index, write_html_link_details -from .storage.json import ( - 
parse_json_main_index, - write_json_main_index, - parse_json_link_details, - write_json_link_details, -) -from .storage.sql import ( - write_sql_main_index, - parse_sql_main_index, -) -from .util import ( - scheme, - enforce_types, - TimedProgress, - atomic_write, - ExtendedEncoder, -) -from .parse import parse_links -from .logs import ( +from ..cli.logging import ( log_indexing_process_started, log_indexing_process_finished, log_indexing_started, @@ -46,6 +38,22 @@ from .logs import ( log_parsing_finished, ) +from .schema import Link, ArchiveResult +from .html import ( + write_html_main_index, + write_html_link_details, +) +from .json import ( + parse_json_main_index, + write_json_main_index, + parse_json_link_details, + write_json_link_details, +) +from .sql import ( + write_sql_main_index, + parse_sql_main_index, +) + ### Link filtering and checking @enforce_types @@ -95,11 +103,11 @@ def merge_links(a: Link, b: Link) -> Link: } for method in all_methods: deduped_jsons = { - json.dumps(result, sort_keys=True, cls=ExtendedEncoder) + pyjson.dumps(result, sort_keys=True, cls=ExtendedEncoder) for result in history[method] } history[method] = list(reversed(sorted( - (ArchiveResult.from_json(json.loads(result)) for result in deduped_jsons), + (ArchiveResult.from_json(pyjson.loads(result)) for result in deduped_jsons), key=lambda result: result.start_ts, ))) @@ -114,7 +122,7 @@ def merge_links(a: Link, b: Link) -> Link: @enforce_types -def validate_links(links: Iterable[Link]) -> Iterable[Link]: +def validate_links(links: Iterable[Link]) -> List[Link]: links = archivable_links(links) # remove chrome://, about:, mailto: etc. links = sorted_links(links) # deterministically sort the links based on timstamp, url links = uniquefied_links(links) # merge/dedupe duplicate timestamps & urls @@ -128,7 +136,7 @@ def validate_links(links: Iterable[Link]) -> Iterable[Link]: stderr(' archivebox help') raise SystemExit(1) - return links + return list(links) @enforce_types @@ -259,23 +267,32 @@ def load_main_index_meta(out_dir: str=OUTPUT_DIR) -> Optional[dict]: index_path = os.path.join(out_dir, JSON_INDEX_FILENAME) if os.path.exists(index_path): with open(index_path, 'r', encoding='utf-8') as f: - meta_dict = json.load(f) + meta_dict = pyjson.load(f) meta_dict.pop('links') return meta_dict return None @enforce_types -def import_new_links(existing_links: List[Link], import_path: str) -> Tuple[List[Link], List[Link]]: +def import_new_links(existing_links: List[Link], + import_path: str, + out_dir: str=OUTPUT_DIR) -> Tuple[List[Link], List[Link]]: + new_links: List[Link] = [] # parse and validate the import file log_parsing_started(import_path) raw_links, parser_name = parse_links(import_path) - new_links = list(validate_links(raw_links)) + new_links = validate_links(raw_links) # merge existing links in out_dir and new links - all_links = list(validate_links(existing_links + new_links)) + all_links = validate_links(existing_links + new_links) + all_link_urls = {link.url for link in existing_links} + + new_links = [ + link for link in new_links + if link.url not in all_link_urls + ] if parser_name: num_parsed = len(raw_links) @@ -345,3 +362,231 @@ def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link: return merge_links(existing_link, link) return link + + + +LINK_FILTERS = { + 'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern), + 'substring': lambda link, pattern: pattern in link.url, + 'regex': lambda link, pattern: bool(re.match(pattern, link.url)), + 
'domain': lambda link, pattern: link.domain == pattern, +} + +@enforce_types +def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str='exact') -> bool: + for pattern in filter_patterns: + try: + if LINK_FILTERS[filter_type](link, pattern): + return True + except Exception: + stderr() + stderr( + f'[X] Got invalid pattern for --filter-type={filter_type}:', + color='red', + ) + stderr(f' {pattern}') + raise SystemExit(2) + + return False + + +def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]: + """indexed links without checking archive status or data directory validity""" + return { + link.link_dir: link + for link in links + } + +def get_archived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]: + """indexed links that are archived with a valid data directory""" + return { + link.link_dir: link + for link in filter(is_archived, links) + } + +def get_unarchived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]: + """indexed links that are unarchived with no data directory or an empty data directory""" + return { + link.link_dir: link + for link in filter(is_unarchived, links) + } + +def get_present_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]: + """dirs that are expected to exist based on the main index""" + all_folders = {} + + for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)): + if entry.is_dir(follow_symlinks=True): + link = None + try: + link = parse_json_link_details(entry.path) + except Exception: + pass + + all_folders[entry.path] = link + + return all_folders + +def get_valid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]: + """dirs with a valid index matched to the main index and archived content""" + return { + link.link_dir: link + for link in filter(is_valid, links) + } + +def get_invalid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]: + """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized""" + duplicate = get_duplicate_folders(links, out_dir=OUTPUT_DIR) + orphaned = get_orphaned_folders(links, out_dir=OUTPUT_DIR) + corrupted = get_corrupted_folders(links, out_dir=OUTPUT_DIR) + unrecognized = get_unrecognized_folders(links, out_dir=OUTPUT_DIR) + return {**duplicate, **orphaned, **corrupted, **unrecognized} + + +def get_duplicate_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]: + """dirs that conflict with other directories that have the same link URL or timestamp""" + links = list(links) + by_url = {link.url: 0 for link in links} + by_timestamp = {link.timestamp: 0 for link in links} + + duplicate_folders = {} + + indexed_folders = {link.link_dir for link in links} + data_folders = ( + entry.path + for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)) + if entry.is_dir(follow_symlinks=True) and entry.path not in indexed_folders + ) + + for path in chain(sorted(indexed_folders), sorted(data_folders)): + link = None + try: + link = parse_json_link_details(path) + except Exception: + pass + + if link: + # link folder has same timestamp as different link folder + by_timestamp[link.timestamp] = by_timestamp.get(link.timestamp, 0) + 1 + if by_timestamp[link.timestamp] > 1: + duplicate_folders[path] = link + + # link folder has same url as different link folder + by_url[link.url] = by_url.get(link.url, 0) + 1 + if by_url[link.url] > 1: + duplicate_folders[path] = link + + return duplicate_folders + +def get_orphaned_folders(links, out_dir: 
str=OUTPUT_DIR) -> Dict[str, Optional[Link]]: + """dirs that contain a valid index but aren't listed in the main index""" + links = list(links) + indexed_folders = {link.link_dir: link for link in links} + orphaned_folders = {} + + for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)): + if entry.is_dir(follow_symlinks=True): + link = None + try: + link = parse_json_link_details(entry.path) + except Exception: + pass + + if link and entry.path not in indexed_folders: + # folder is a valid link data dir with index details, but it's not in the main index + orphaned_folders[entry.path] = link + + return orphaned_folders + +def get_corrupted_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]: + """dirs that don't contain a valid index and aren't listed in the main index""" + return { + link.link_dir: link + for link in filter(is_corrupt, links) + } + +def get_unrecognized_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]: + """dirs that don't contain recognizable archive data and aren't listed in the main index""" + by_timestamp = {link.timestamp: 0 for link in links} + unrecognized_folders: Dict[str, Optional[Link]] = {} + + for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)): + if entry.is_dir(follow_symlinks=True): + index_exists = os.path.exists(os.path.join(entry.path, 'index.json')) + link = None + try: + link = parse_json_link_details(entry.path) + except Exception: + pass + + if index_exists and link is None: + # index exists but it's corrupted or unparseable + unrecognized_folders[entry.path] = link + + elif not index_exists: + # link details index doesn't exist and the folder isn't in the main index + timestamp = entry.path.rsplit('/', 1)[-1] + if timestamp not in by_timestamp: + unrecognized_folders[entry.path] = link + + return unrecognized_folders + + +def is_valid(link: Link) -> bool: + dir_exists = os.path.exists(link.link_dir) + index_exists = os.path.exists(os.path.join(link.link_dir, 'index.json')) + if not dir_exists: + # unarchived links are not included in the valid list + return False + if dir_exists and not index_exists: + return False + if dir_exists and index_exists: + try: + parsed_link = parse_json_link_details(link.link_dir) + return link.url == parsed_link.url + except Exception: + pass + return False + +def is_corrupt(link: Link) -> bool: + if not os.path.exists(link.link_dir): + # unarchived links are not considered corrupt + return False + + if is_valid(link): + return False + + return True + +def is_archived(link: Link) -> bool: + return is_valid(link) and link.is_archived + +def is_unarchived(link: Link) -> bool: + if not os.path.exists(link.link_dir): + return True + return not link.is_archived + + +def fix_invalid_folder_locations(out_dir: str=OUTPUT_DIR) -> Tuple[List[str], List[str]]: + fixed = [] + cant_fix = [] + for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)): + if entry.is_dir(follow_symlinks=True): + if os.path.exists(os.path.join(entry.path, 'index.json')): + link = parse_json_link_details(entry.path) + if not link: + continue + + if not entry.path.endswith(f'/{link.timestamp}'): + dest = os.path.join(out_dir, ARCHIVE_DIR_NAME, link.timestamp) + if os.path.exists(dest): + cant_fix.append(entry.path) + else: + shutil.move(entry.path, dest) + fixed.append(dest) + + if link.link_dir != entry.path: + link = link.overwrite(link_dir=entry.path) + write_json_link_details(link, out_dir=entry.path) + + return fixed, cant_fix diff --git a/archivebox/legacy/storage/html.py 
b/archivebox/index/html.py similarity index 98% rename from archivebox/legacy/storage/html.py rename to archivebox/index/html.py index 545c06de..3cba2bf0 100644 --- a/archivebox/legacy/storage/html.py +++ b/archivebox/index/html.py @@ -1,11 +1,22 @@ -__package__ = 'archivebox.legacy.storage' +__package__ = 'archivebox.index' import os from datetime import datetime from typing import List, Optional, Iterator -from ..schema import Link +from .schema import Link +from ..util import ( + enforce_types, + ts_to_date, + urlencode, + htmlencode, + urldecode, + wget_output_path, + render_template, + atomic_write, + copy_and_overwrite, +) from ..config import ( OUTPUT_DIR, TEMPLATES_DIR, @@ -18,17 +29,6 @@ from ..config import ( ROBOTS_TXT_FILENAME, FAVICON_FILENAME, ) -from ..util import ( - enforce_types, - ts_to_date, - urlencode, - htmlencode, - urldecode, - wget_output_path, - render_template, - atomic_write, - copy_and_overwrite, -) join = lambda *paths: os.path.join(*paths) MAIN_INDEX_TEMPLATE = join(TEMPLATES_DIR, 'main_index.html') diff --git a/archivebox/legacy/storage/json.py b/archivebox/index/json.py similarity index 90% rename from archivebox/legacy/storage/json.py rename to archivebox/index/json.py index 2ec56fbf..4d75d095 100644 --- a/archivebox/legacy/storage/json.py +++ b/archivebox/index/json.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox.legacy.storage' +__package__ = 'archivebox.index' import os import sys @@ -7,7 +7,8 @@ import json from datetime import datetime from typing import List, Optional, Iterator -from ..schema import Link, ArchiveResult +from .schema import Link, ArchiveResult +from ..util import enforce_types, atomic_write from ..config import ( VERSION, OUTPUT_DIR, @@ -17,14 +18,11 @@ from ..config import ( JSON_INDEX_FILENAME, ARCHIVE_DIR_NAME, ) -from ..util import ( - enforce_types, - atomic_write, -) + MAIN_INDEX_HEADER = { 'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.', - 'schema': 'archivebox.legacy.storage.json', + 'schema': 'archivebox.index.json', 'copyright_info': FOOTER_INFO, 'meta': { 'project': 'ArchiveBox', @@ -43,7 +41,7 @@ MAIN_INDEX_HEADER = { @enforce_types def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]: - """parse a archive index json file and return the list of links""" + """parse an archive index json file and return the list of links""" index_path = os.path.join(out_dir, JSON_INDEX_FILENAME) if os.path.exists(index_path): @@ -110,4 +108,6 @@ def parse_json_links_details(out_dir: str) -> Iterator[Link]: for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)): if entry.is_dir(follow_symlinks=True): if os.path.exists(os.path.join(entry.path, 'index.json')): - yield parse_json_link_details(entry.path) + link = parse_json_link_details(entry.path) + if link: + yield link diff --git a/archivebox/legacy/schema.py b/archivebox/index/schema.py similarity index 93% rename from archivebox/legacy/schema.py rename to archivebox/index/schema.py index 1020f294..1cec34b1 100644 --- a/archivebox/legacy/schema.py +++ b/archivebox/index/schema.py @@ -1,3 +1,5 @@ +__package__ = 'archivebox.index' + import os from datetime import datetime @@ -48,7 +50,7 @@ class ArchiveResult: @classmethod def from_json(cls, json_info): - from .util import parse_date + from ..util import parse_date info = { key: val @@ -60,12 +62,12 @@ class ArchiveResult: return cls(**info) def to_json(self, indent=4, sort_keys=True): - from .util import to_json + from ..util import to_json return to_json(self, 
indent=indent, sort_keys=sort_keys) def to_csv(self, cols=None, ljust: int=0, separator: str=','): - from .util import to_json + from ..util import to_json cols = cols or self.field_names() return separator.join( @@ -115,7 +117,7 @@ class Link: return float(self.timestamp) > float(other.timestamp) def typecheck(self) -> None: - from .config import stderr, ANSI + from ..config import stderr, ANSI try: assert self.schema == self.__class__.__name__ assert isinstance(self.timestamp, str) and self.timestamp @@ -176,7 +178,7 @@ class Link: @classmethod def from_json(cls, json_info): - from .util import parse_date + from ..util import parse_date info = { key: val @@ -200,12 +202,12 @@ class Link: return cls(**info) def to_json(self, indent=4, sort_keys=True): - from .util import to_json + from ..util import to_json return to_json(self, indent=indent, sort_keys=sort_keys) def to_csv(self, csv_cols: List[str], ljust: int=0, separator: str=','): - from .util import to_json + from ..util import to_json return separator.join( to_json(getattr(self, col), indent=None).ljust(ljust) @@ -218,60 +220,60 @@ class Link: @property def link_dir(self) -> str: - from .config import CONFIG + from ..config import CONFIG return os.path.join(CONFIG['ARCHIVE_DIR'], self.timestamp) @property def archive_path(self) -> str: - from .config import ARCHIVE_DIR_NAME + from ..config import ARCHIVE_DIR_NAME return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp) ### URL Helpers @property def url_hash(self): - from .util import hashurl + from ..util import hashurl return hashurl(self.url) @property def scheme(self) -> str: - from .util import scheme + from ..util import scheme return scheme(self.url) @property def extension(self) -> str: - from .util import extension + from ..util import extension return extension(self.url) @property def domain(self) -> str: - from .util import domain + from ..util import domain return domain(self.url) @property def path(self) -> str: - from .util import path + from ..util import path return path(self.url) @property def basename(self) -> str: - from .util import basename + from ..util import basename return basename(self.url) @property def base_url(self) -> str: - from .util import base_url + from ..util import base_url return base_url(self.url) ### Pretty Printing Helpers @property def bookmarked_date(self) -> Optional[str]: - from .util import ts_to_date + from ..util import ts_to_date return ts_to_date(self.timestamp) if self.timestamp else None @property def updated_date(self) -> Optional[str]: - from .util import ts_to_date + from ..util import ts_to_date return ts_to_date(self.updated) if self.updated else None @property @@ -304,13 +306,13 @@ class Link: @property def is_static(self) -> bool: - from .util import is_static_file + from ..util import is_static_file return is_static_file(self.url) @property def is_archived(self) -> bool: - from .config import ARCHIVE_DIR - from .util import domain + from ..config import ARCHIVE_DIR + from ..util import domain output_paths = ( domain(self.url), @@ -352,7 +354,7 @@ class Link: def canonical_outputs(self) -> Dict[str, Optional[str]]: """predict the expected output paths that should be present after archiving""" - from .util import wget_output_path + from ..util import wget_output_path canonical = { 'index_path': 'index.html', 'favicon_path': 'favicon.ico', diff --git a/archivebox/legacy/storage/sql.py b/archivebox/index/sql.py similarity index 80% rename from archivebox/legacy/storage/sql.py rename to archivebox/index/sql.py index 
363be514..942054c2 100644 --- a/archivebox/legacy/storage/sql.py +++ b/archivebox/index/sql.py @@ -1,9 +1,9 @@ -__package__ = 'archivebox.legacy.storage' +__package__ = 'archivebox.index' from io import StringIO from typing import List, Tuple, Iterator -from ..schema import Link +from .schema import Link from ..util import enforce_types from ..config import setup_django, OUTPUT_DIR @@ -25,9 +25,19 @@ def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None: setup_django(out_dir, check_db=True) from core.models import Page - for link in links: + all_urls = {link.url: link for link in links} + + for page in Page.objects.all(): + if page.url in all_urls: + info = {k: v for k, v in all_urls.pop(page.url)._asdict().items() if k in Page.keys} + Page.objects.update(**info) + else: + page.delete() + + for url, link in all_urls.items(): info = {k: v for k, v in link._asdict().items() if k in Page.keys} - Page.objects.update_or_create(url=link.url, defaults=info) + Page.objects.update_or_create(url=url, defaults=info) + @enforce_types diff --git a/archivebox/legacy/ArchiveBox.conf b/archivebox/legacy/ArchiveBox.conf deleted file mode 100644 index fe7b674c..00000000 --- a/archivebox/legacy/ArchiveBox.conf +++ /dev/null @@ -1,58 +0,0 @@ -# This is the example default configiration file for ArchiveBox. -# -# Copy example config from here into your project's ArchiveBox.conf file, -# DO NOT EDIT THIS FILE DIRECTLY! -# -# See the list of all the possible options. documentation, and examples here: -# https://github.com/pirate/ArchiveBox/wiki/Configuration - -[GENERAL_CONFIG] -OUTPUT_PERMISSIONS = 755 -ONLY_NEW = False -TIMEOUT = 60 -MEDIA_TIMEOUT = 3600 -ACTIVE_THEME = default -FOOTER_INFO = Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests. 
-URL_BLACKLIST = (://(.*\.)?facebook\.com)|(://(.*\.)?ebay\.com)|(.*\.exe$) - -[ARCHIVE_METHOD_TOGGLES] -SAVE_TITLE = True -SAVE_FAVICON = True -SAVE_WGET = True -SAVE_WGET_REQUISITES = True -SAVE_WARC = True -SAVE_PDF = True -SAVE_SCREENSHOT = True -SAVE_DOM = True -SAVE_GIT = True -SAVE_MEDIA = False -SAVE_ARCHIVE_DOT_ORG = True - - -[ARCHIVE_METHOD_OPTIONS] -CHECK_SSL_VALIDITY = True -RESOLUTION = 1440,900 -GIT_DOMAINS = github.com,bitbucket.org,gitlab.com - -CROME_HEADLESS = True -CROME_SANDBOX = True - -COOKIES_FILE = path/to/cookies.txt -CHROME_USER_DATA_DIR = ~/.config/google-chrome/Default - -WGET_USER_AGENT = Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36 -CHROME_USER_AGENT = Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36 - - -[DEPENDENCY_CONFIG] -USE_CURL = True -USE_WGET = True -USE_CHROME = True -USE_YOUTUBEDL = True -USE_GIT = True - -CURL_BINARY = curl -GIT_BINARY = git" -WGET_BINARY = wget -YOUTUBEDL_BINARY = youtube-dl -CHROME_BINARY = chromium diff --git a/archivebox/legacy/__init__.py b/archivebox/legacy/__init__.py deleted file mode 100644 index 2bbcd2fc..00000000 --- a/archivebox/legacy/__init__.py +++ /dev/null @@ -1 +0,0 @@ -__package__ = 'archivebox.legacy' diff --git a/archivebox/legacy/archive_methods.py b/archivebox/legacy/archive_methods.py deleted file mode 100644 index de4f8668..00000000 --- a/archivebox/legacy/archive_methods.py +++ /dev/null @@ -1,694 +0,0 @@ -import os - -from typing import Dict, List, Tuple, Optional -from collections import defaultdict -from datetime import datetime - -from .schema import Link, ArchiveResult, ArchiveOutput -from .index import ( - load_link_details, - write_link_details, - patch_main_index, -) -from .config import ( - CURL_BINARY, - GIT_BINARY, - WGET_BINARY, - YOUTUBEDL_BINARY, - SAVE_FAVICON, - SAVE_TITLE, - SAVE_WGET, - SAVE_WGET_REQUISITES, - SAVE_PDF, - SAVE_SCREENSHOT, - SAVE_DOM, - SAVE_WARC, - SAVE_GIT, - SAVE_MEDIA, - SAVE_ARCHIVE_DOT_ORG, - TIMEOUT, - MEDIA_TIMEOUT, - GIT_DOMAINS, - VERSION, - WGET_USER_AGENT, - CHECK_SSL_VALIDITY, - COOKIES_FILE, - CURL_VERSION, - WGET_VERSION, - CHROME_VERSION, - GIT_VERSION, - YOUTUBEDL_VERSION, - WGET_AUTO_COMPRESSION, -) -from .util import ( - enforce_types, - domain, - extension, - without_query, - without_fragment, - fetch_page_title, - is_static_file, - TimedProgress, - chmod_file, - wget_output_path, - chrome_args, - run, PIPE, DEVNULL, -) -from .logs import ( - log_link_archiving_started, - log_link_archiving_finished, - log_archive_method_started, - log_archive_method_finished, -) - - -class ArchiveError(Exception): - def __init__(self, message, hints=None): - super().__init__(message) - self.hints = hints - - -@enforce_types -def archive_link(link: Link, out_dir: Optional[str]=None) -> Link: - """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp""" - - ARCHIVE_METHODS = ( - ('title', should_save_title, save_title), - ('favicon', should_save_favicon, save_favicon), - ('wget', should_save_wget, save_wget), - ('pdf', should_save_pdf, save_pdf), - ('screenshot', should_save_screenshot, save_screenshot), - ('dom', should_save_dom, save_dom), - ('git', should_save_git, save_git), - ('media', should_save_media, save_media), - ('archive_org', should_save_archive_dot_org, save_archive_dot_org), - ) - - out_dir = out_dir or link.link_dir - try: - is_new = not os.path.exists(out_dir) - if is_new: - 
os.makedirs(out_dir) - - link = load_link_details(link, out_dir=out_dir) - log_link_archiving_started(link, out_dir, is_new) - link = link.overwrite(updated=datetime.now()) - stats = {'skipped': 0, 'succeeded': 0, 'failed': 0} - - for method_name, should_run, method_function in ARCHIVE_METHODS: - try: - if method_name not in link.history: - link.history[method_name] = [] - - if should_run(link, out_dir): - log_archive_method_started(method_name) - - result = method_function(link=link, out_dir=out_dir) - - link.history[method_name].append(result) - - stats[result.status] += 1 - log_archive_method_finished(result) - else: - stats['skipped'] += 1 - except Exception as e: - raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format( - method_name, - link.url, - )) from e - - # print(' ', stats) - - write_link_details(link, out_dir=link.link_dir) - patch_main_index(link) - - # # If any changes were made, update the main links index json and html - # was_changed = stats['succeeded'] or stats['failed'] - # if was_changed: - # patch_main_index(link) - - log_link_archiving_finished(link, link.link_dir, is_new, stats) - - except KeyboardInterrupt: - try: - write_link_details(link, out_dir=link.link_dir) - except: - pass - raise - - except Exception as err: - print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err)) - raise - - return link - - -### Archive Method Functions - -@enforce_types -def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool: - # if link already has valid title, skip it - if link.title and not link.title.lower().startswith('http'): - return False - - if is_static_file(link.url): - return False - - return SAVE_TITLE - -@enforce_types -def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: - """try to guess the page's title from its content""" - - output: ArchiveOutput = None - cmd = [ - CURL_BINARY, - link.url, - '|', - 'grep', - ' bool: - out_dir = out_dir or link.link_dir - if os.path.exists(os.path.join(out_dir, 'favicon.ico')): - return False - - return SAVE_FAVICON - -@enforce_types -def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: - """download site favicon from google's favicon api""" - - out_dir = out_dir or link.link_dir - output: ArchiveOutput = 'favicon.ico' - cmd = [ - CURL_BINARY, - '--max-time', str(timeout), - '--location', - '--output', str(output), - *([] if CHECK_SSL_VALIDITY else ['--insecure']), - 'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)), - ] - status = 'succeeded' - timer = TimedProgress(timeout, prefix=' ') - try: - run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout) - chmod_file(output, cwd=out_dir) - except Exception as err: - status = 'failed' - output = err - finally: - timer.end() - - return ArchiveResult( - cmd=cmd, - pwd=out_dir, - cmd_version=CURL_VERSION, - output=output, - status=status, - **timer.stats, - ) - -@enforce_types -def should_save_wget(link: Link, out_dir: Optional[str]=None) -> bool: - output_path = wget_output_path(link) - out_dir = out_dir or link.link_dir - if output_path and os.path.exists(os.path.join(out_dir, output_path)): - return False - - return SAVE_WGET - - -@enforce_types -def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: - """download full site using wget""" - - out_dir = out_dir or link.link_dir - if SAVE_WARC: - warc_dir = os.path.join(out_dir, 'warc') - os.makedirs(warc_dir, exist_ok=True) - 
warc_path = os.path.join('warc', str(int(datetime.now().timestamp()))) - - # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html - output: ArchiveOutput = None - cmd = [ - WGET_BINARY, - # '--server-response', # print headers for better error parsing - '--no-verbose', - '--adjust-extension', - '--convert-links', - '--force-directories', - '--backup-converted', - '--span-hosts', - '--no-parent', - '-e', 'robots=off', - '--restrict-file-names=windows', - '--timeout={}'.format(timeout), - *([] if SAVE_WARC else ['--timestamping']), - *(['--warc-file={}'.format(warc_path)] if SAVE_WARC else []), - *(['--page-requisites'] if SAVE_WGET_REQUISITES else []), - *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []), - *(['--load-cookies', COOKIES_FILE] if COOKIES_FILE else []), - *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []), - *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']), - link.url, - ] - status = 'succeeded' - timer = TimedProgress(timeout, prefix=' ') - try: - result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout) - output = wget_output_path(link) - - # parse out number of files downloaded from last line of stderr: - # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)" - output_tail = [ - line.strip() - for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] - if line.strip() - ] - files_downloaded = ( - int(output_tail[-1].strip().split(' ', 2)[1] or 0) - if 'Downloaded:' in output_tail[-1] - else 0 - ) - - # Check for common failure cases - if result.returncode > 0 and files_downloaded < 1: - hints = ( - 'Got wget response code: {}.'.format(result.returncode), - *output_tail, - ) - if b'403: Forbidden' in result.stderr: - raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints) - if b'404: Not Found' in result.stderr: - raise ArchiveError('404 Not Found', hints) - if b'ERROR 500: Internal Server Error' in result.stderr: - raise ArchiveError('500 Internal Server Error', hints) - raise ArchiveError('Got an error from the server', hints) - except Exception as err: - status = 'failed' - output = err - finally: - timer.end() - - return ArchiveResult( - cmd=cmd, - pwd=out_dir, - cmd_version=WGET_VERSION, - output=output, - status=status, - **timer.stats, - ) - -@enforce_types -def should_save_pdf(link: Link, out_dir: Optional[str]=None) -> bool: - out_dir = out_dir or link.link_dir - if is_static_file(link.url): - return False - - if os.path.exists(os.path.join(out_dir, 'output.pdf')): - return False - - return SAVE_PDF - - -@enforce_types -def save_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: - """print PDF of site to file using chrome --headless""" - - out_dir = out_dir or link.link_dir - output: ArchiveOutput = 'output.pdf' - cmd = [ - *chrome_args(TIMEOUT=timeout), - '--print-to-pdf', - link.url, - ] - status = 'succeeded' - timer = TimedProgress(timeout, prefix=' ') - try: - result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout) - - if result.returncode: - hints = (result.stderr or result.stdout).decode() - raise ArchiveError('Failed to save PDF', hints) - - chmod_file('output.pdf', cwd=out_dir) - except Exception as err: - status = 'failed' - output = err - finally: - timer.end() - - return ArchiveResult( - cmd=cmd, - pwd=out_dir, - cmd_version=CHROME_VERSION, - output=output, - status=status, - **timer.stats, - ) - -@enforce_types -def should_save_screenshot(link: Link, out_dir: Optional[str]=None) -> bool: - 
out_dir = out_dir or link.link_dir - if is_static_file(link.url): - return False - - if os.path.exists(os.path.join(out_dir, 'screenshot.png')): - return False - - return SAVE_SCREENSHOT - -@enforce_types -def save_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: - """take screenshot of site using chrome --headless""" - - out_dir = out_dir or link.link_dir - output: ArchiveOutput = 'screenshot.png' - cmd = [ - *chrome_args(TIMEOUT=timeout), - '--screenshot', - link.url, - ] - status = 'succeeded' - timer = TimedProgress(timeout, prefix=' ') - try: - result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout) - - if result.returncode: - hints = (result.stderr or result.stdout).decode() - raise ArchiveError('Failed to save screenshot', hints) - - chmod_file(output, cwd=out_dir) - except Exception as err: - status = 'failed' - output = err - finally: - timer.end() - - return ArchiveResult( - cmd=cmd, - pwd=out_dir, - cmd_version=CHROME_VERSION, - output=output, - status=status, - **timer.stats, - ) - -@enforce_types -def should_save_dom(link: Link, out_dir: Optional[str]=None) -> bool: - out_dir = out_dir or link.link_dir - if is_static_file(link.url): - return False - - if os.path.exists(os.path.join(out_dir, 'output.html')): - return False - - return SAVE_DOM - -@enforce_types -def save_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: - """print HTML of site to file using chrome --dump-html""" - - out_dir = out_dir or link.link_dir - output: ArchiveOutput = 'output.html' - output_path = os.path.join(out_dir, str(output)) - cmd = [ - *chrome_args(TIMEOUT=timeout), - '--dump-dom', - link.url - ] - status = 'succeeded' - timer = TimedProgress(timeout, prefix=' ') - try: - with open(output_path, 'w+') as f: - result = run(cmd, stdout=f, stderr=PIPE, cwd=out_dir, timeout=timeout) - - if result.returncode: - hints = result.stderr.decode() - raise ArchiveError('Failed to save DOM', hints) - - chmod_file(output, cwd=out_dir) - except Exception as err: - status = 'failed' - output = err - finally: - timer.end() - - return ArchiveResult( - cmd=cmd, - pwd=out_dir, - cmd_version=CHROME_VERSION, - output=output, - status=status, - **timer.stats, - ) - -@enforce_types -def should_save_git(link: Link, out_dir: Optional[str]=None) -> bool: - out_dir = out_dir or link.link_dir - if is_static_file(link.url): - return False - - if os.path.exists(os.path.join(out_dir, 'git')): - return False - - is_clonable_url = ( - (domain(link.url) in GIT_DOMAINS) - or (extension(link.url) == 'git') - ) - if not is_clonable_url: - return False - - return SAVE_GIT - - -@enforce_types -def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: - """download full site using git""" - - out_dir = out_dir or link.link_dir - output: ArchiveOutput = 'git' - output_path = os.path.join(out_dir, str(output)) - os.makedirs(output_path, exist_ok=True) - cmd = [ - GIT_BINARY, - 'clone', - '--mirror', - '--recursive', - *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']), - without_query(without_fragment(link.url)), - ] - status = 'succeeded' - timer = TimedProgress(timeout, prefix=' ') - try: - result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1) - - if result.returncode == 128: - # ignore failed re-download when the folder already exists - pass - elif result.returncode > 0: - hints = 'Got git response code: {}.'.format(result.returncode) - raise ArchiveError('Failed to 
save git clone', hints) - - except Exception as err: - status = 'failed' - output = err - finally: - timer.end() - - return ArchiveResult( - cmd=cmd, - pwd=out_dir, - cmd_version=GIT_VERSION, - output=output, - status=status, - **timer.stats, - ) - - -@enforce_types -def should_save_media(link: Link, out_dir: Optional[str]=None) -> bool: - out_dir = out_dir or link.link_dir - - if is_static_file(link.url): - return False - - if os.path.exists(os.path.join(out_dir, 'media')): - return False - - return SAVE_MEDIA - -@enforce_types -def save_media(link: Link, out_dir: Optional[str]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult: - """Download playlists or individual video, audio, and subtitles using youtube-dl""" - - out_dir = out_dir or link.link_dir - output: ArchiveOutput = 'media' - output_path = os.path.join(out_dir, str(output)) - os.makedirs(output_path, exist_ok=True) - cmd = [ - YOUTUBEDL_BINARY, - '--write-description', - '--write-info-json', - '--write-annotations', - '--yes-playlist', - '--write-thumbnail', - '--no-call-home', - '--no-check-certificate', - '--user-agent', - '--all-subs', - '--extract-audio', - '--keep-video', - '--ignore-errors', - '--geo-bypass', - '--audio-format', 'mp3', - '--audio-quality', '320K', - '--embed-thumbnail', - '--add-metadata', - *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']), - link.url, - ] - status = 'succeeded' - timer = TimedProgress(timeout, prefix=' ') - try: - result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1) - chmod_file(output, cwd=out_dir) - if result.returncode: - if (b'ERROR: Unsupported URL' in result.stderr - or b'HTTP Error 404' in result.stderr - or b'HTTP Error 403' in result.stderr - or b'URL could be a direct video link' in result.stderr - or b'Unable to extract container ID' in result.stderr): - # These happen too frequently on non-media pages to warrant printing to console - pass - else: - hints = ( - 'Got youtube-dl response code: {}.'.format(result.returncode), - *result.stderr.decode().split('\n'), - ) - raise ArchiveError('Failed to save media', hints) - except Exception as err: - status = 'failed' - output = err - finally: - timer.end() - - return ArchiveResult( - cmd=cmd, - pwd=out_dir, - cmd_version=YOUTUBEDL_VERSION, - output=output, - status=status, - **timer.stats, - ) - - -@enforce_types -def should_save_archive_dot_org(link: Link, out_dir: Optional[str]=None) -> bool: - out_dir = out_dir or link.link_dir - if is_static_file(link.url): - return False - - if os.path.exists(os.path.join(out_dir, 'archive.org.txt')): - # if open(path, 'r').read().strip() != 'None': - return False - - return SAVE_ARCHIVE_DOT_ORG - -@enforce_types -def save_archive_dot_org(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: - """submit site to archive.org for archiving via their service, save returned archive url""" - - out_dir = out_dir or link.link_dir - output: ArchiveOutput = 'archive.org.txt' - archive_org_url = None - submit_url = 'https://web.archive.org/save/{}'.format(link.url) - cmd = [ - CURL_BINARY, - '--location', - '--head', - '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from - '--max-time', str(timeout), - *([] if CHECK_SSL_VALIDITY else ['--insecure']), - submit_url, - ] - status = 'succeeded' - timer = TimedProgress(timeout, prefix=' ') - try: - result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, 
timeout=timeout) - content_location, errors = parse_archive_dot_org_response(result.stdout) - if content_location: - archive_org_url = 'https://web.archive.org{}'.format(content_location[0]) - elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]: - archive_org_url = None - # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url))) - elif errors: - raise ArchiveError(', '.join(errors)) - else: - raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.') - except Exception as err: - status = 'failed' - output = err - finally: - timer.end() - - if output and not isinstance(output, Exception): - # instead of writing None when archive.org rejects the url write the - # url to resubmit it to archive.org. This is so when the user visits - # the URL in person, it will attempt to re-archive it, and it'll show the - # nicer error message explaining why the url was rejected if it fails. - archive_org_url = archive_org_url or submit_url - with open(os.path.join(out_dir, str(output)), 'w', encoding='utf-8') as f: - f.write(archive_org_url) - chmod_file('archive.org.txt', cwd=out_dir) - output = archive_org_url - - return ArchiveResult( - cmd=cmd, - pwd=out_dir, - cmd_version=CURL_VERSION, - output=output, - status=status, - **timer.stats, - ) - -@enforce_types -def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]: - # Parse archive.org response headers - headers: Dict[str, List[str]] = defaultdict(list) - - # lowercase all the header names and store in dict - for header in response.splitlines(): - if b':' not in header or not header.strip(): - continue - name, val = header.decode().split(':', 1) - headers[name.lower().strip()].append(val.strip()) - - # Get successful archive url in "content-location" header or any errors - content_location = headers['content-location'] - errors = headers['x-archive-wayback-runtime-error'] - return content_location, errors diff --git a/archivebox/legacy/main.py b/archivebox/legacy/main.py deleted file mode 100644 index 4095fa24..00000000 --- a/archivebox/legacy/main.py +++ /dev/null @@ -1,626 +0,0 @@ -import os -import re -import shutil - -from typing import Dict, List, Optional, Iterable -from itertools import chain - -from .schema import Link -from .util import ( - enforce_types, - TimedProgress, - get_dir_size, - human_readable_size, -) -from .index import ( - links_after_timestamp, - load_main_index, - import_new_links, - write_main_index, -) -from .storage.json import ( - parse_json_main_index, - parse_json_link_details, - parse_json_links_details, -) -from .storage.sql import parse_sql_main_index, get_admins -from .storage.html import parse_html_main_index -from .archive_methods import archive_link -from .config import ( - stderr, - ANSI, - ONLY_NEW, - OUTPUT_DIR, - SOURCES_DIR, - ARCHIVE_DIR, - LOGS_DIR, - CONFIG_FILE, - ARCHIVE_DIR_NAME, - SOURCES_DIR_NAME, - LOGS_DIR_NAME, - STATIC_DIR_NAME, - JSON_INDEX_FILENAME, - HTML_INDEX_FILENAME, - SQL_INDEX_FILENAME, - ROBOTS_TXT_FILENAME, - FAVICON_FILENAME, - check_dependencies, - check_data_folder, - setup_django, - write_config_file, -) -from .logs import ( - log_archiving_started, - log_archiving_paused, - log_archiving_finished, - log_removal_started, - log_removal_finished, - log_list_started, - log_list_finished, -) - - -ALLOWED_IN_OUTPUT_DIR = { - '.DS_Store', - '.venv', - 'venv', - 'virtualenv', - '.virtualenv', - ARCHIVE_DIR_NAME, - SOURCES_DIR_NAME, - LOGS_DIR_NAME, - STATIC_DIR_NAME, - SQL_INDEX_FILENAME, 
- JSON_INDEX_FILENAME, - HTML_INDEX_FILENAME, - ROBOTS_TXT_FILENAME, - FAVICON_FILENAME, -} - - -@enforce_types -def init(): - os.makedirs(OUTPUT_DIR, exist_ok=True) - - is_empty = not len(set(os.listdir(OUTPUT_DIR)) - ALLOWED_IN_OUTPUT_DIR) - existing_index = os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME)) - - if is_empty and not existing_index: - print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI)) - print(f' {OUTPUT_DIR}') - print('{green}------------------------------------------------------------------{reset}'.format(**ANSI)) - elif existing_index: - print('{green}[*] Updating existing ArchiveBox collection in this folder...{reset}'.format(**ANSI)) - print(f' {OUTPUT_DIR}') - print('{green}------------------------------------------------------------------{reset}'.format(**ANSI)) - else: - stderr( - ("{red}[X] This folder appears to already have files in it, but no index.json is present.{reset}\n\n" - " You must run init in a completely empty directory, or an existing data folder.\n\n" - " {lightred}Hint:{reset} To import an existing data folder make sure to cd into the folder first, \n" - " then run and run 'archivebox init' to pick up where you left off.\n\n" - " (Always make sure your data folder is backed up first before updating ArchiveBox)" - ).format(OUTPUT_DIR, **ANSI) - ) - raise SystemExit(1) - - if existing_index: - print('\n{green}[*] Verifying archive folder structure...{reset}'.format(**ANSI)) - else: - print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI)) - - os.makedirs(SOURCES_DIR, exist_ok=True) - print(f' √ {SOURCES_DIR}') - - os.makedirs(ARCHIVE_DIR, exist_ok=True) - print(f' √ {ARCHIVE_DIR}') - - os.makedirs(LOGS_DIR, exist_ok=True) - print(f' √ {LOGS_DIR}') - - write_config_file({}, out_dir=OUTPUT_DIR) - print(f' √ {CONFIG_FILE}') - - if os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME)): - print('\n{green}[*] Verifying main SQL index and running migrations...{reset}'.format(**ANSI)) - else: - print('\n{green}[+] Building main SQL index and running migrations...{reset}'.format(**ANSI)) - - setup_django(OUTPUT_DIR, check_db=False) - from django.conf import settings - assert settings.DATABASE_FILE == os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME) - print(f' √ {settings.DATABASE_FILE}') - print() - from .storage.sql import apply_migrations - for migration_line in apply_migrations(OUTPUT_DIR): - print(f' {migration_line}') - - - assert os.path.exists(settings.DATABASE_FILE) - - # from django.contrib.auth.models import User - # if IS_TTY and not User.objects.filter(is_superuser=True).exists(): - # print('{green}[+] Creating admin user account...{reset}'.format(**ANSI)) - # call_command("createsuperuser", interactive=True) - - print() - print('{green}[*] Collecting links from any existing index or archive folders...{reset}'.format(**ANSI)) - - all_links = {} - if existing_index: - all_links = { - link.url: link - for link in load_main_index(out_dir=OUTPUT_DIR, warn=False) - } - print(' √ Loaded {} links from existing main index...'.format(len(all_links))) - - orphaned_json_links = { - link.url: link - for link in parse_json_main_index(OUTPUT_DIR) - if link.url not in all_links - } - if orphaned_json_links: - all_links.update(orphaned_json_links) - print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI)) - - orphaned_sql_links = { - link.url: link - for link in parse_sql_main_index(OUTPUT_DIR) - if link.url 
not in all_links - } - if orphaned_sql_links: - all_links.update(orphaned_sql_links) - print(' {lightyellow}√ Added {} orphaned links from existing SQL index...{reset}'.format(len(orphaned_sql_links), **ANSI)) - - orphaned_data_dir_links = { - link.url: link - for link in parse_json_links_details(OUTPUT_DIR) - } - orphan_new_links = { - url: link - for url, link in orphaned_data_dir_links.items() - if url not in all_links - } - orphan_duplicates = { - url: link - for url, link in orphaned_data_dir_links.items() - if url in all_links - } - if orphan_new_links: - all_links.update(orphan_new_links) - print(' {lightyellow}√ Added {} orphaned links from existing archive directories...{reset}'.format(len(orphan_new_links), **ANSI)) - if orphan_duplicates: - print(' {lightyellow}! Skipped adding {} invalid link data directories that would have overwritten or corrupted existing data.{reset}'.format(len(orphan_duplicates), **ANSI)) - - orphaned_data_dirs = {folder for folder in orphan_duplicates.keys()} - invalid_folders = { - folder: link - for folder, link in get_invalid_folders(all_links.values(), out_dir=OUTPUT_DIR).items() - if folder not in orphaned_data_dirs - } - if invalid_folders: - print(' {lightyellow}! Skipped adding {} corrupted/unrecognized link data directories that could not be read.{reset}'.format(len(orphan_duplicates), **ANSI)) - - if orphan_duplicates or invalid_folders: - print(' For more information about the link data directories that were skipped, run:') - print(' archivebox info') - print(' archivebox list --status=invalid') - print(' archivebox list --status=orphaned') - print(' archivebox list --status=duplicate') - - - write_main_index(list(all_links.values()), out_dir=OUTPUT_DIR) - - print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI)) - if existing_index: - print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI)) - else: - print('{green}[√] Done. 
A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI)) - print() - print(' To view your archive index, open:') - print(' {}'.format(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME))) - print() - print(' To add new links, you can run:') - print(" archivebox add 'https://example.com'") - print() - print(' For more usage and examples, run:') - print(' archivebox help') - - -@enforce_types -def info(): - - print('{green}[*] Scanning archive collection main index...{reset}'.format(**ANSI)) - print(f' {OUTPUT_DIR}/*') - num_bytes, num_dirs, num_files = get_dir_size(OUTPUT_DIR, recursive=False, pattern='index.') - size = human_readable_size(num_bytes) - print(f' Size: {size} across {num_files} files') - print() - - links = list(load_main_index(out_dir=OUTPUT_DIR)) - num_json_links = len(links) - num_sql_links = sum(1 for link in parse_sql_main_index(out_dir=OUTPUT_DIR)) - num_html_links = sum(1 for url in parse_html_main_index(out_dir=OUTPUT_DIR)) - num_link_details = sum(1 for link in parse_json_links_details(out_dir=OUTPUT_DIR)) - users = get_admins().values_list('username', flat=True) - print(f' > JSON Main Index: {num_json_links} links'.ljust(36), f'(found in {JSON_INDEX_FILENAME})') - print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})') - print(f' > HTML Main Index: {num_html_links} links'.ljust(36), f'(found in {HTML_INDEX_FILENAME})') - print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)') - - print(f' > Admin: {len(users)} users {", ".join(users)}'.ljust(36), f'(found in {SQL_INDEX_FILENAME})') - - if num_html_links != len(links) or num_sql_links != len(links): - print() - print(' {lightred}Hint:{reset} You can fix index count differences automatically by running:'.format(**ANSI)) - print(' archivebox init') - - if not users: - print() - print(' {lightred}Hint:{reset} You can create an admin user by running:'.format(**ANSI)) - print(' archivebox manage createsuperuser') - - print() - print('{green}[*] Scanning archive collection link data directories...{reset}'.format(**ANSI)) - print(f' {ARCHIVE_DIR}/*') - - num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR) - size = human_readable_size(num_bytes) - print(f' Size: {size} across {num_files} files in {num_dirs} directories') - print() - - num_indexed = len(get_indexed_folders(links, out_dir=OUTPUT_DIR)) - num_archived = len(get_archived_folders(links, out_dir=OUTPUT_DIR)) - num_unarchived = len(get_unarchived_folders(links, out_dir=OUTPUT_DIR)) - print(f' > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})') - print(f' > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})') - print(f' > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})') - - num_present = len(get_present_folders(links, out_dir=OUTPUT_DIR)) - num_valid = len(get_valid_folders(links, out_dir=OUTPUT_DIR)) - print() - print(f' > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})') - print(f' > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})') - - duplicate = get_duplicate_folders(links, out_dir=OUTPUT_DIR) - orphaned = get_orphaned_folders(links, out_dir=OUTPUT_DIR) - corrupted = get_corrupted_folders(links, out_dir=OUTPUT_DIR) - unrecognized = get_unrecognized_folders(links, out_dir=OUTPUT_DIR) - num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized}) - print(f' > invalid: {num_invalid}'.ljust(36), 
f'({get_invalid_folders.__doc__})') - print(f' > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})') - print(f' > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})') - print(f' > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})') - print(f' > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})') - - if num_indexed: - print() - print(' {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**ANSI)) - print(' archivebox list --status= (e.g. indexed, corrupted, archived, etc.)') - - if orphaned: - print() - print(' {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**ANSI)) - print(' archivebox init') - - if num_invalid: - print() - print(' {lightred}Hint:{reset} You may need to manually remove or fix some invalid data directories, afterwards make sure to run:'.format(**ANSI)) - print(' archivebox init') - - print() - - - -@enforce_types -def update_archive_data(import_path: Optional[str]=None, - resume: Optional[float]=None, - only_new: bool=False, - index_only: bool=False) -> List[Link]: - """The main ArchiveBox entrancepoint. Everything starts here.""" - - check_dependencies() - check_data_folder() - - # Step 1: Load list of links from the existing index - # merge in and dedupe new links from import_path - all_links: List[Link] = [] - new_links: List[Link] = [] - all_links = load_main_index(out_dir=OUTPUT_DIR) - if import_path: - all_links, new_links = import_new_links(all_links, import_path) - - # Step 2: Write updated index with deduped old and new links back to disk - write_main_index(links=list(all_links), out_dir=OUTPUT_DIR) - - if index_only: - return all_links - - # Step 3: Run the archive methods for each link - links = new_links if ONLY_NEW else all_links - log_archiving_started(len(links), resume) - idx: int = 0 - link: Link = None # type: ignore - try: - for idx, link in enumerate(links_after_timestamp(links, resume)): - archive_link(link, out_dir=link.link_dir) - - except KeyboardInterrupt: - log_archiving_paused(len(links), idx, link.timestamp if link else '0') - raise SystemExit(0) - - except: - print() - raise - - log_archiving_finished(len(links)) - - # Step 4: Re-write links index with updated titles, icons, and resources - all_links = load_main_index(out_dir=OUTPUT_DIR) - write_main_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True) - return all_links - - -LINK_FILTERS = { - 'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern), - 'substring': lambda link, pattern: pattern in link.url, - 'regex': lambda link, pattern: bool(re.match(pattern, link.url)), - 'domain': lambda link, pattern: link.domain == pattern, -} - -@enforce_types -def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str='exact') -> bool: - for pattern in filter_patterns: - if LINK_FILTERS[filter_type](link, pattern): - return True - - return False - - -@enforce_types -def list_archive_data(filter_patterns: Optional[List[str]]=None, filter_type: str='exact', - after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]: - - all_links = load_main_index(out_dir=OUTPUT_DIR) - - for link in all_links: - if after is not None and float(link.timestamp) < after: - continue - if before is not None and float(link.timestamp) > before: - continue - - if filter_patterns: - if link_matches_filter(link, filter_patterns, filter_type): - yield link - 
else: - yield link - - -@enforce_types -def remove_archive_links(filter_patterns: List[str], filter_type: str='exact', - after: Optional[float]=None, before: Optional[float]=None, - yes: bool=False, delete: bool=False) -> List[Link]: - - check_dependencies() - check_data_folder() - - log_list_started(filter_patterns, filter_type) - timer = TimedProgress(360, prefix=' ') - try: - links = list(list_archive_data( - filter_patterns=filter_patterns, - filter_type=filter_type, - after=after, - before=before, - )) - finally: - timer.end() - - if not len(links): - log_removal_finished(0, 0) - raise SystemExit(1) - - - log_list_finished(links) - log_removal_started(links, yes=yes, delete=delete) - - timer = TimedProgress(360, prefix=' ') - try: - to_keep = [] - all_links = load_main_index(out_dir=OUTPUT_DIR) - for link in all_links: - should_remove = ( - (after is not None and float(link.timestamp) < after) - or (before is not None and float(link.timestamp) > before) - or link_matches_filter(link, filter_patterns, filter_type) - ) - if not should_remove: - to_keep.append(link) - elif should_remove and delete: - shutil.rmtree(link.link_dir) - finally: - timer.end() - - write_main_index(links=to_keep, out_dir=OUTPUT_DIR, finished=True) - log_removal_finished(len(all_links), len(to_keep)) - - return to_keep - - - -def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]: - """indexed links without checking archive status or data directory validity""" - return { - link.link_dir: link - for link in links - } - -def get_archived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]: - """indexed links that are archived with a valid data directory""" - return { - link.link_dir: link - for link in filter(is_archived, links) - } - -def get_unarchived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]: - """indexed links that are unarchived with no data directory or an empty data directory""" - return { - link.link_dir: link - for link in filter(is_unarchived, links) - } - -def get_present_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]: - """dirs that are expected to exist based on the main index""" - all_folders = {} - - for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)): - if entry.is_dir(follow_symlinks=True): - link = None - try: - link = parse_json_link_details(entry.path) - except Exception: - pass - - all_folders[entry.path] = link - - return all_folders - -def get_valid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]: - """dirs with a valid index matched to the main index and archived content""" - return { - link.link_dir: link - for link in filter(is_valid, links) - } - -def get_invalid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]: - """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized""" - duplicate = get_duplicate_folders(links, out_dir=OUTPUT_DIR) - orphaned = get_orphaned_folders(links, out_dir=OUTPUT_DIR) - corrupted = get_corrupted_folders(links, out_dir=OUTPUT_DIR) - unrecognized = get_unrecognized_folders(links, out_dir=OUTPUT_DIR) - return {**duplicate, **orphaned, **corrupted, **unrecognized} - - -def get_duplicate_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]: - """dirs that conflict with other directories that have the same link URL or timestamp""" - links = list(links) - by_url = {link.url: 0 for link in links} - by_timestamp = {link.timestamp: 0 for link in links} - - duplicate_folders = {} - - 
indexed_folders = {link.link_dir for link in links} - data_folders = ( - entry.path - for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)) - if entry.is_dir(follow_symlinks=True) and entry.path not in indexed_folders - ) - - for path in chain(sorted(indexed_folders), sorted(data_folders)): - link = None - try: - link = parse_json_link_details(path) - except Exception: - pass - - if link: - # link folder has same timestamp as different link folder - by_timestamp[link.timestamp] = by_timestamp.get(link.timestamp, 0) + 1 - if by_timestamp[link.timestamp] > 1: - duplicate_folders[path] = link - - # link folder has same url as different link folder - by_url[link.url] = by_url.get(link.url, 0) + 1 - if by_url[link.url] > 1: - duplicate_folders[path] = link - - return duplicate_folders - -def get_orphaned_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]: - """dirs that contain a valid index but aren't listed in the main index""" - links = list(links) - indexed_folders = {link.link_dir: link for link in links} - orphaned_folders = {} - - for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)): - if entry.is_dir(follow_symlinks=True): - index_exists = os.path.exists(os.path.join(entry.path, 'index.json')) - link = None - try: - link = parse_json_link_details(entry.path) - except Exception: - pass - - if index_exists and entry.path not in indexed_folders: - # folder is a valid link data dir with index details, but it's not in the main index - orphaned_folders[entry.path] = link - - return orphaned_folders - -def get_corrupted_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]: - """dirs that don't contain a valid index and aren't listed in the main index""" - return { - link.link_dir: link - for link in filter(is_corrupt, links) - } - -def get_unrecognized_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]: - """dirs that don't contain recognizable archive data and aren't listed in the main index""" - by_timestamp = {link.timestamp: 0 for link in links} - unrecognized_folders: Dict[str, Optional[Link]] = {} - - for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)): - if entry.is_dir(follow_symlinks=True): - index_exists = os.path.exists(os.path.join(entry.path, 'index.json')) - link = None - try: - link = parse_json_link_details(entry.path) - except Exception: - pass - - if index_exists and link is None: - # index exists but it's corrupted or unparseable - unrecognized_folders[entry.path] = link - - elif not index_exists: - # link details index doesn't exist and the folder isn't in the main index - timestamp = entry.path.rsplit('/', 1)[-1] - if timestamp not in by_timestamp: - unrecognized_folders[entry.path] = link - - return unrecognized_folders - - -def is_valid(link: Link) -> bool: - dir_exists = os.path.exists(link.link_dir) - index_exists = os.path.exists(os.path.join(link.link_dir, 'index.json')) - if not dir_exists: - # unarchived links are not included in the valid list - return False - if dir_exists and not index_exists: - return False - if dir_exists and index_exists: - try: - parsed_link = parse_json_link_details(link.link_dir) - return link.url == parsed_link.url - except Exception: - pass - return False - -def is_corrupt(link: Link) -> bool: - if not os.path.exists(link.link_dir): - # unarchived links are not considered corrupt - return False - - if is_valid(link): - return False - - return True - -def is_archived(link: Link) -> bool: - return is_valid(link) and link.is_archived - -def 
is_unarchived(link: Link) -> bool: - if not os.path.exists(link.link_dir): - return True - return not link.is_archived diff --git a/archivebox/legacy/mypy_django.ini b/archivebox/legacy/mypy_django.ini deleted file mode 100644 index 306e567c..00000000 --- a/archivebox/legacy/mypy_django.ini +++ /dev/null @@ -1,10 +0,0 @@ -[mypy_django_plugin] - -# specify settings module to use for django.conf.settings, this setting -# could also be specified with DJANGO_SETTINGS_MODULE environment variable -# (it also takes priority over config file) -django_settings = core.settings - -# if True, all unknown settings in django.conf.settings will fallback to Any, -# specify it if your settings are loaded dynamically to avoid false positives -ignore_missing_settings = True diff --git a/archivebox/legacy/parse.py b/archivebox/legacy/parse.py deleted file mode 100644 index 49ffa7fd..00000000 --- a/archivebox/legacy/parse.py +++ /dev/null @@ -1,331 +0,0 @@ -""" -Everything related to parsing links from input sources. - -For a list of supported services, see the README.md. -For examples of supported import formats see tests/. - -Link: { - 'url': 'https://example.com/example/?abc=123&xyc=345#lmnop', - 'timestamp': '1544212312.4234', - 'title': 'Example.com Page Title', - 'tags': 'abc,def', - 'sources': [ - 'output/sources/ril_export.html', - 'output/sources/getpocket.com-1523422111.txt', - 'output/sources/stdin-234234112312.txt' - ] -} -""" - -import re -import json - -from typing import Tuple, List, IO, Iterable -from datetime import datetime -import xml.etree.ElementTree as etree - -from .config import TIMEOUT -from .util import ( - htmldecode, - str_between, - URL_REGEX, - check_url_parsing_invariants, - TimedProgress, - Link, - enforce_types, -) - - -@enforce_types -def parse_links(source_file: str) -> Tuple[List[Link], str]: - """parse a list of URLs with their metadata from an - RSS feed, bookmarks export, or text file - """ - - check_url_parsing_invariants() - PARSERS = ( - # Specialized parsers - ('Pocket HTML', parse_pocket_html_export), - ('Pinboard RSS', parse_pinboard_rss_export), - ('Shaarli RSS', parse_shaarli_rss_export), - ('Medium RSS', parse_medium_rss_export), - - # General parsers - ('Netscape HTML', parse_netscape_html_export), - ('Generic RSS', parse_rss_export), - ('Generic JSON', parse_json_export), - - # Fallback parser - ('Plain Text', parse_plain_text_export), - ) - timer = TimedProgress(TIMEOUT * 4) - with open(source_file, 'r', encoding='utf-8') as file: - for parser_name, parser_func in PARSERS: - try: - links = list(parser_func(file)) - if links: - timer.end() - return links, parser_name - except Exception as err: # noqa - # Parsers are tried one by one down the list, and the first one - # that succeeds is used. To see why a certain parser was not used - # due to error or format incompatibility, uncomment this line: - # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err)) - pass - - timer.end() - return [], 'Failed to parse' - - -### Import Parser Functions - -@enforce_types -def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]: - """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)""" - - html_file.seek(0) - pattern = re.compile("^\\s*
<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE) - for line in html_file: - # example line - # <li><a href="..." time_added="..." tags="...">example title</a></li> - match = pattern.search(line) - if match: - url = match.group(1).replace('http://www.readability.com/read?url=', '') # remove old readability prefixes to get original url - time = datetime.fromtimestamp(float(match.group(2))) - tags = match.group(3) - title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') - - yield Link( - url=htmldecode(url), - timestamp=str(time.timestamp()), - title=htmldecode(title) or None, - tags=tags or '', - sources=[html_file.name], - )
- - -@enforce_types -def parse_json_export(json_file: IO[str]) -> Iterable[Link]: - """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)""" - - json_file.seek(0) - links = json.load(json_file) - json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z') - - for link in links: - # example line - # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}] - if link: - # Parse URL - url = link.get('href') or link.get('url') or link.get('URL') - if not url: - raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]') - - # Parse the timestamp - ts_str = str(datetime.now().timestamp()) - if link.get('timestamp'): - # chrome/ff histories use a very precise timestamp - ts_str = str(link['timestamp'] / 10000000) - elif link.get('time'): - ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp()) - elif link.get('created_at'): - ts_str = str(json_date(link['created_at']).timestamp()) - elif link.get('created'): - ts_str = str(json_date(link['created']).timestamp()) - elif link.get('date'): - ts_str = str(json_date(link['date']).timestamp()) - elif link.get('bookmarked'): - ts_str = str(json_date(link['bookmarked']).timestamp()) - elif link.get('saved'): - ts_str = str(json_date(link['saved']).timestamp()) - - # Parse the title - title = None - if link.get('title'): - title = link['title'].strip() - elif link.get('description'): - title = link['description'].replace(' — Readability', '').strip() - elif link.get('name'): - title = link['name'].strip() - - yield Link( - url=htmldecode(url), - timestamp=ts_str, - title=htmldecode(title) or None, - tags=htmldecode(link.get('tags')) or '', - sources=[json_file.name], - )
- - -@enforce_types -def parse_rss_export(rss_file: IO[str]) -> Iterable[Link]: - """Parse RSS XML-format files into links""" - - rss_file.seek(0) - items = rss_file.read().split('<item>') - items = items[1:] if items else [] - for item in items: - # example item: - # <item> - # <title><![CDATA[How JavaScript works: inside the V8 engine]]></title> - # <category>Unread</category> - # <link>https://blog.sessionstack.com/how-javascript-works-inside</link> - # <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid> - # <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate> - # </item> - - trailing_removed = item.split('</item>', 1)[0] - leading_removed = trailing_removed.split('<item>', 1)[-1].strip() - rows = leading_removed.split('\n') - - def get_row(key): - return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0] - - url = str_between(get_row('link'), '<link>', '</link>') - ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>') - time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z") - title = str_between(get_row('title'), '<![CDATA[', ']]').strip() - - yield Link( - url=htmldecode(url), - timestamp=str(time.timestamp()), - title=htmldecode(title) or None, - tags=None, - sources=[rss_file.name], - )
- - -@enforce_types -def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]: - """Parse Shaarli-specific RSS XML-format files into links""" - - rss_file.seek(0) - entries = rss_file.read().split('<entry>')[1:] - for entry in entries: - # example entry: - # <entry> - #   <title>Aktuelle Trojaner-Welle: Emotet lauert in gefälschten Rechnungsmails | heise online</title> - #   <link href="https://demo.shaarli.org/?cEV4vw" /> - #   <id>https://demo.shaarli.org/?cEV4vw</id> - #   <published>2019-01-30T06:06:01+00:00</published> - #   <updated>2019-01-30T06:06:01+00:00</updated> - #   <content><![CDATA[... <a href="https://demo.shaarli.org/?cEV4vw">Permalink</a> ...]]></content> - # </entry> - - trailing_removed = entry.split('</entry>', 1)[0] - leading_removed = trailing_removed.strip() - rows = leading_removed.split('\n') - - def get_row(key): - return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0] - - title = str_between(get_row('title'), '<title>', '</title>').strip() - url = str_between(get_row('link'), '<link href="', '" />') - ts_str = str_between(get_row('published'), '<published>', '</published>') - time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z") - - yield Link( - url=htmldecode(url), - timestamp=str(time.timestamp()), - title=htmldecode(title) or None, - tags=None, - sources=[rss_file.name], - )
- - -@enforce_types -def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]: - """Parse netscape-format bookmarks export files (produced by all browsers)""" - - html_file.seek(0) - pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE) - for line in html_file: - # example line - # <DT><A HREF="..." ADD_DATE="...">
    example bookmark title - - match = pattern.search(line) - if match: - url = match.group(1) - time = datetime.fromtimestamp(float(match.group(2))) - title = match.group(3).strip() - - yield Link( - url=htmldecode(url), - timestamp=str(time.timestamp()), - title=htmldecode(title) or None, - tags=None, - sources=[html_file.name], - ) - - -@enforce_types -def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]: - """Parse Pinboard RSS feed files into links""" - - rss_file.seek(0) - root = etree.parse(rss_file).getroot() - items = root.findall("{http://purl.org/rss/1.0/}item") - for item in items: - find = lambda p: item.find(p).text.strip() if item.find(p) else None # type: ignore - - url = find("{http://purl.org/rss/1.0/}link") - tags = find("{http://purl.org/dc/elements/1.1/}subject") - title = find("{http://purl.org/rss/1.0/}title") - ts_str = find("{http://purl.org/dc/elements/1.1/}date") - - # Pinboard includes a colon in its date stamp timezone offsets, which - # Python can't parse. Remove it: - if ts_str and ts_str[-3:-2] == ":": - ts_str = ts_str[:-3]+ts_str[-2:] - - if ts_str: - time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z") - else: - time = datetime.now() - - yield Link( - url=htmldecode(url), - timestamp=str(time.timestamp()), - title=htmldecode(title) or None, - tags=htmldecode(tags) or None, - sources=[rss_file.name], - ) - - -@enforce_types -def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]: - """Parse Medium RSS feed files into links""" - - rss_file.seek(0) - root = etree.parse(rss_file).getroot() - items = root.find("channel").findall("item") # type: ignore - for item in items: - url = item.find("link").text # type: ignore - title = item.find("title").text.strip() # type: ignore - ts_str = item.find("pubDate").text # type: ignore - time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z") # type: ignore - - yield Link( - url=htmldecode(url), - timestamp=str(time.timestamp()), - title=htmldecode(title) or None, - tags=None, - sources=[rss_file.name], - ) - - -@enforce_types -def parse_plain_text_export(text_file: IO[str]) -> Iterable[Link]: - """Parse raw links from each line in a text file""" - - text_file.seek(0) - for line in text_file.readlines(): - urls = re.findall(URL_REGEX, line) if line.strip() else () - for url in urls: # type: ignore - yield Link( - url=htmldecode(url), - timestamp=str(datetime.now().timestamp()), - title=None, - tags=None, - sources=[text_file.name], - ) diff --git a/archivebox/legacy/purge.py b/archivebox/legacy/purge.py deleted file mode 100755 index b36083f0..00000000 --- a/archivebox/legacy/purge.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env python3 - -import re -from argparse import ArgumentParser -from os.path import exists, join -from shutil import rmtree -from typing import List - -from .config import ARCHIVE_DIR, OUTPUT_DIR -from .index import ( - parse_json_links_index, - write_html_links_index, - write_json_links_index, -) - - -def cleanup_index(regexes: List[str], proceed: bool, delete: bool) -> None: - if not exists(join(OUTPUT_DIR, 'index.json')): - exit('index.json is missing; nothing to do') - - compiled = [re.compile(r) for r in regexes] - links = parse_json_links_index(OUTPUT_DIR) - filtered = [] - remaining = [] - - for link in links: - url = link.url - for r in compiled: - if r.search(url): - filtered.append((link, r)) - break - else: - remaining.append(link) - - if not filtered: - exit('Search did not match any entries.') - - print('Filtered out {}/{} urls:'.format(len(filtered), 
len(links))) - - for link, regex in filtered: - url = link.url - print(' {url} via {regex}'.format(url=url, regex=regex.pattern)) - - if not proceed: - answer = input('Remove {} entries from index? [y/n] '.format( - len(filtered))) - proceed = answer.strip().lower() in ('y', 'yes') - - if not proceed: - exit('Aborted') - - write_json_links_index(OUTPUT_DIR, remaining) - write_html_links_index(OUTPUT_DIR, remaining) - - if delete: - for link, _ in filtered: - data_dir = join(ARCHIVE_DIR, link['timestamp']) - if exists(data_dir): - rmtree(data_dir) - - -if __name__ == '__main__': - p = ArgumentParser('Index purging tool') - p.add_argument( - '--regex', - '-r', - action='append', - help='Regular expression matching URLs to purge', - ) - p.add_argument( - '--delete', - '-d', - action='store_true', - default=False, - help='Delete webpage files from archive', - ) - p.add_argument( - '--yes', - '-y', - action='store_true', - default=False, - help='Do not prompt for confirmation', - ) - - args = p.parse_args() - if args.regex: - cleanup_index(args.regex, proceed=args.yes, delete=args.delete) - else: - p.print_help() diff --git a/archivebox/legacy/storage/__init__.py b/archivebox/legacy/storage/__init__.py deleted file mode 100644 index 40c7f113..00000000 --- a/archivebox/legacy/storage/__init__.py +++ /dev/null @@ -1 +0,0 @@ -__package__ = 'archivebox.legacy.storage' diff --git a/archivebox/main.py b/archivebox/main.py new file mode 100644 index 00000000..501f6efd --- /dev/null +++ b/archivebox/main.py @@ -0,0 +1,1086 @@ +__package__ = 'archivebox' + +import re +import os +import sys +import shutil + +from typing import Dict, List, Optional, Set, Tuple, Iterable, IO + +from crontab import CronTab, CronSlices + +from .cli import ( + list_subcommands, + run_subcommand, + display_first, + meta_cmds, + main_cmds, + archive_cmds, +) +from .index.schema import Link +from .util import ( + enforce_types, + TimedProgress, + get_dir_size, + human_readable_size, + save_stdin_to_sources, + save_file_to_sources, + links_to_csv, + to_json, + folders_to_str, +) +from .index import ( + links_after_timestamp, + load_main_index, + import_new_links, + write_main_index, + link_matches_filter, + get_indexed_folders, + get_archived_folders, + get_unarchived_folders, + get_present_folders, + get_valid_folders, + get_invalid_folders, + get_duplicate_folders, + get_orphaned_folders, + get_corrupted_folders, + get_unrecognized_folders, + fix_invalid_folder_locations, +) +from .index.json import ( + parse_json_main_index, + parse_json_links_details, +) +from .index.sql import parse_sql_main_index, get_admins, apply_migrations +from .index.html import parse_html_main_index +from .extractors import archive_link +from .config import ( + stderr, + ConfigDict, + ANSI, + IS_TTY, + USER, + ARCHIVEBOX_BINARY, + ONLY_NEW, + OUTPUT_DIR, + SOURCES_DIR, + ARCHIVE_DIR, + LOGS_DIR, + CONFIG_FILE, + ARCHIVE_DIR_NAME, + SOURCES_DIR_NAME, + LOGS_DIR_NAME, + STATIC_DIR_NAME, + JSON_INDEX_FILENAME, + HTML_INDEX_FILENAME, + SQL_INDEX_FILENAME, + ROBOTS_TXT_FILENAME, + FAVICON_FILENAME, + check_dependencies, + check_data_folder, + write_config_file, + setup_django, + VERSION, + CODE_LOCATIONS, + EXTERNAL_LOCATIONS, + DATA_LOCATIONS, + DEPENDENCIES, + load_all_config, + CONFIG, + USER_CONFIG, + get_real_name, +) +from .cli.logging import ( + log_archiving_started, + log_archiving_paused, + log_archiving_finished, + log_removal_started, + log_removal_finished, + log_list_started, + log_list_finished, +) + + +ALLOWED_IN_OUTPUT_DIR = { + 
'.DS_Store', + '.venv', + 'venv', + 'virtualenv', + '.virtualenv', + ARCHIVE_DIR_NAME, + SOURCES_DIR_NAME, + LOGS_DIR_NAME, + STATIC_DIR_NAME, + SQL_INDEX_FILENAME, + JSON_INDEX_FILENAME, + HTML_INDEX_FILENAME, + ROBOTS_TXT_FILENAME, + FAVICON_FILENAME, +} + +def help(out_dir: str=OUTPUT_DIR) -> None: + all_subcommands = list_subcommands() + COMMANDS_HELP_TEXT = '\n '.join( + f'{cmd.ljust(20)} {summary}' + for cmd, summary in all_subcommands.items() + if cmd in meta_cmds + ) + '\n\n ' + '\n '.join( + f'{cmd.ljust(20)} {summary}' + for cmd, summary in all_subcommands.items() + if cmd in main_cmds + ) + '\n\n ' + '\n '.join( + f'{cmd.ljust(20)} {summary}' + for cmd, summary in all_subcommands.items() + if cmd in archive_cmds + ) + '\n\n ' + '\n '.join( + f'{cmd.ljust(20)} {summary}' + for cmd, summary in all_subcommands.items() + if cmd not in display_first + ) + + + if os.path.exists(os.path.join(out_dir, JSON_INDEX_FILENAME)): + print('''{green}ArchiveBox v{}: The self-hosted internet archive.{reset} + +{lightred}Active data directory:{reset} + {} + +{lightred}Usage:{reset} + archivebox [command] [--help] [--version] [...args] + +{lightred}Commands:{reset} + {} + +{lightred}Example Use:{reset} + mkdir my-archive; cd my-archive/ + archivebox init + archivebox info + + archivebox add https://example.com/some/page + archivebox add --depth=1 ~/Downloads/bookmarks_export.html + + archivebox list --sort=timestamp --csv=timestamp,url,is_archived + archivebox schedule --every=week https://example.com/some/feed.rss + archivebox update --resume=15109948213.123 + +{lightred}Documentation:{reset} + https://github.com/pirate/ArchiveBox/wiki +'''.format(VERSION, out_dir, COMMANDS_HELP_TEXT, **ANSI)) + + else: + print('{green}Welcome to ArchiveBox v{}!{reset}'.format(VERSION, **ANSI)) + print() + print('To import an existing archive (from a previous version of ArchiveBox):') + print(' 1. cd into your data dir OUTPUT_DIR (usually ArchiveBox/output) and run:') + print(' 2. archivebox init') + print() + print('To start a new archive:') + print(' 1. Create an empty directory, then cd into it and run:') + print(' 2. 
archivebox init') + print() + print('For more information, see the documentation here:') + print(' https://github.com/pirate/ArchiveBox/wiki') + + +def version(quiet: bool=False, out_dir: str=OUTPUT_DIR) -> None: + if quiet: + print(VERSION) + else: + print('ArchiveBox v{}'.format(VERSION)) + print() + + print('{white}[i] Dependency versions:{reset}'.format(**ANSI)) + for name, dependency in DEPENDENCIES.items(): + print_dependency_version(name, dependency) + + print() + print('{white}[i] Code locations:{reset}'.format(**ANSI)) + for name, folder in CODE_LOCATIONS.items(): + print_folder_status(name, folder) + + print() + print('{white}[i] External locations:{reset}'.format(**ANSI)) + for name, folder in EXTERNAL_LOCATIONS.items(): + print_folder_status(name, folder) + + print() + print('{white}[i] Data locations:{reset}'.format(**ANSI)) + for name, folder in DATA_LOCATIONS.items(): + print_folder_status(name, folder) + + print() + check_dependencies() + + +def run(subcommand: str, subcommand_args: Optional[List[str]], stdin: Optional[IO]=None, out_dir: str=OUTPUT_DIR) -> None: + run_subcommand( + subcommand=subcommand, + subcommand_args=subcommand_args, + stdin=stdin, + out_dir=out_dir, + ) + + +def init(out_dir: str=OUTPUT_DIR) -> None: + os.makedirs(out_dir, exist_ok=True) + + is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR) + existing_index = os.path.exists(os.path.join(out_dir, JSON_INDEX_FILENAME)) + + if is_empty and not existing_index: + print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI)) + print(f' {out_dir}') + print('{green}------------------------------------------------------------------{reset}'.format(**ANSI)) + elif existing_index: + print('{green}[*] Updating existing ArchiveBox collection in this folder...{reset}'.format(**ANSI)) + print(f' {out_dir}') + print('{green}------------------------------------------------------------------{reset}'.format(**ANSI)) + else: + stderr( + ("{red}[X] This folder appears to already have files in it, but no index.json is present.{reset}\n\n" + " You must run init in a completely empty directory, or an existing data folder.\n\n" + " {lightred}Hint:{reset} To import an existing data folder make sure to cd into the folder first, \n" + " then run and run 'archivebox init' to pick up where you left off.\n\n" + " (Always make sure your data folder is backed up first before updating ArchiveBox)" + ).format(out_dir, **ANSI) + ) + raise SystemExit(1) + + if existing_index: + print('\n{green}[*] Verifying archive folder structure...{reset}'.format(**ANSI)) + else: + print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI)) + + os.makedirs(SOURCES_DIR, exist_ok=True) + print(f' √ {SOURCES_DIR}') + + os.makedirs(ARCHIVE_DIR, exist_ok=True) + print(f' √ {ARCHIVE_DIR}') + + os.makedirs(LOGS_DIR, exist_ok=True) + print(f' √ {LOGS_DIR}') + + write_config_file({}, out_dir=out_dir) + print(f' √ {CONFIG_FILE}') + + if os.path.exists(os.path.join(out_dir, SQL_INDEX_FILENAME)): + print('\n{green}[*] Verifying main SQL index and running migrations...{reset}'.format(**ANSI)) + else: + print('\n{green}[+] Building main SQL index and running migrations...{reset}'.format(**ANSI)) + + setup_django(out_dir, check_db=False) + from django.conf import settings + assert settings.DATABASE_FILE == os.path.join(out_dir, SQL_INDEX_FILENAME) + print(f' √ {settings.DATABASE_FILE}') + print() + for migration_line in apply_migrations(out_dir): + print(f' {migration_line}') + + + assert 
os.path.exists(settings.DATABASE_FILE) + + # from django.contrib.auth.models import User + # if IS_TTY and not User.objects.filter(is_superuser=True).exists(): + # print('{green}[+] Creating admin user account...{reset}'.format(**ANSI)) + # call_command("createsuperuser", interactive=True) + + print() + print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI)) + + all_links: Dict[str, Link] = {} + if existing_index: + all_links = { + link.url: link + for link in load_main_index(out_dir=out_dir, warn=False) + } + print(' √ Loaded {} links from existing main index.'.format(len(all_links))) + + # Links in data folders that dont match their timestamp + fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir) + if fixed: + print(' {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI)) + if cant_fix: + print(' {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI)) + + # Links in JSON index but not in main index + orphaned_json_links = { + link.url: link + for link in parse_json_main_index(out_dir) + if link.url not in all_links + } + if orphaned_json_links: + all_links.update(orphaned_json_links) + print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI)) + + # Links in SQL index but not in main index + orphaned_sql_links = { + link.url: link + for link in parse_sql_main_index(out_dir) + if link.url not in all_links + } + if orphaned_sql_links: + all_links.update(orphaned_sql_links) + print(' {lightyellow}√ Added {} orphaned links from existing SQL index...{reset}'.format(len(orphaned_sql_links), **ANSI)) + + # Links in data dir indexes but not in main index + orphaned_data_dir_links = { + link.url: link + for link in parse_json_links_details(out_dir) + if link.url not in all_links + } + if orphaned_data_dir_links: + all_links.update(orphaned_data_dir_links) + print(' {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI)) + + # Links in invalid/duplicate data dirs + invalid_folders = { + folder: link + for folder, link in get_invalid_folders(all_links.values(), out_dir=out_dir).items() + } + if invalid_folders: + print(' {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI)) + print(' X ' + '\n X '.join(f'{folder} {link}' for folder, link in invalid_folders.items())) + print() + print(' {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI)) + print(' archivebox info') + print(' archivebox list --status=invalid') + + + write_main_index(list(all_links.values()), out_dir=out_dir) + + print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI)) + if existing_index: + print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI)) + else: + print('{green}[√] Done. 
A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI)) + print() + print(' To view your archive index, open:') + print(' {}'.format(os.path.join(out_dir, HTML_INDEX_FILENAME))) + print() + print(' To add new links, you can run:') + print(" archivebox add 'https://example.com'") + print() + print(' For more usage and examples, run:') + print(' archivebox help') + + +def info(out_dir: str=OUTPUT_DIR) -> None: + check_data_folder(out_dir=out_dir) + + print('{green}[*] Scanning archive collection main index...{reset}'.format(**ANSI)) + print(f' {out_dir}/*') + num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.') + size = human_readable_size(num_bytes) + print(f' Size: {size} across {num_files} files') + print() + + links = list(load_main_index(out_dir=out_dir)) + num_json_links = len(links) + num_sql_links = sum(1 for link in parse_sql_main_index(out_dir=out_dir)) + num_html_links = sum(1 for url in parse_html_main_index(out_dir=out_dir)) + num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir)) + users = get_admins().values_list('username', flat=True) + print(f' > JSON Main Index: {num_json_links} links'.ljust(36), f'(found in {JSON_INDEX_FILENAME})') + print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})') + print(f' > HTML Main Index: {num_html_links} links'.ljust(36), f'(found in {HTML_INDEX_FILENAME})') + print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)') + + print(f' > Admin: {len(users)} users {", ".join(users)}'.ljust(36), f'(found in {SQL_INDEX_FILENAME})') + + if num_html_links != len(links) or num_sql_links != len(links): + print() + print(' {lightred}Hint:{reset} You can fix index count differences automatically by running:'.format(**ANSI)) + print(' archivebox init') + + if not users: + print() + print(' {lightred}Hint:{reset} You can create an admin user by running:'.format(**ANSI)) + print(' archivebox manage createsuperuser') + + print() + print('{green}[*] Scanning archive collection link data directories...{reset}'.format(**ANSI)) + print(f' {ARCHIVE_DIR}/*') + + num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR) + size = human_readable_size(num_bytes) + print(f' Size: {size} across {num_files} files in {num_dirs} directories') + print() + + num_indexed = len(get_indexed_folders(links, out_dir=out_dir)) + num_archived = len(get_archived_folders(links, out_dir=out_dir)) + num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir)) + print(f' > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})') + print(f' > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})') + print(f' > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})') + + num_present = len(get_present_folders(links, out_dir=out_dir)) + num_valid = len(get_valid_folders(links, out_dir=out_dir)) + print() + print(f' > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})') + print(f' > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})') + + duplicate = get_duplicate_folders(links, out_dir=out_dir) + orphaned = get_orphaned_folders(links, out_dir=out_dir) + corrupted = get_corrupted_folders(links, out_dir=out_dir) + unrecognized = get_unrecognized_folders(links, out_dir=out_dir) + num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized}) + print(f' > invalid: {num_invalid}'.ljust(36), 
f'({get_invalid_folders.__doc__})') + print(f' > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})') + print(f' > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})') + print(f' > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})') + print(f' > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})') + + if num_indexed: + print() + print(' {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**ANSI)) + print(' archivebox list --status= (e.g. indexed, corrupted, archived, etc.)') + + if orphaned: + print() + print(' {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**ANSI)) + print(' archivebox init') + + if num_invalid: + print() + print(' {lightred}Hint:{reset} You may need to manually remove or fix some invalid data directories, afterwards make sure to run:'.format(**ANSI)) + print(' archivebox init') + + print() + + +@enforce_types +def add(import_str: Optional[str]=None, + import_path: Optional[str]=None, + update_all: bool=not ONLY_NEW, + index_only: bool=False, + out_dir: str=OUTPUT_DIR) -> List[Link]: + """The main ArchiveBox entrancepoint. Everything starts here.""" + + check_data_folder(out_dir=out_dir) + + if import_str and import_path: + stderr( + '[X] You should pass either an import path as an argument, ' + 'or pass a list of links via stdin, but not both.\n', + color='red', + ) + raise SystemExit(2) + elif import_str: + import_path = save_stdin_to_sources(import_str, out_dir=out_dir) + else: + import_path = save_file_to_sources(import_path, out_dir=out_dir) + + check_dependencies() + + # Step 1: Load list of links from the existing index + # merge in and dedupe new links from import_path + all_links: List[Link] = [] + new_links: List[Link] = [] + all_links = load_main_index(out_dir=out_dir) + if import_path: + all_links, new_links = import_new_links(all_links, import_path, out_dir=out_dir) + + # Step 2: Write updated index with deduped old and new links back to disk + write_main_index(links=all_links, out_dir=out_dir) + + if index_only: + return all_links + + # Step 3: Run the archive methods for each link + links = all_links if update_all else new_links + log_archiving_started(len(links)) + idx: int = 0 + link: Link = None # type: ignore + try: + for idx, link in enumerate(links): + archive_link(link, out_dir=link.link_dir) + + except KeyboardInterrupt: + log_archiving_paused(len(links), idx, link.timestamp if link else '0') + raise SystemExit(0) + + except: + print() + raise + + log_archiving_finished(len(links)) + + # Step 4: Re-write links index with updated titles, icons, and resources + all_links = load_main_index(out_dir=out_dir) + write_main_index(links=list(all_links), out_dir=out_dir, finished=True) + return all_links + +@enforce_types +def remove(filter_str: Optional[str]=None, + filter_patterns: Optional[List[str]]=None, + filter_type: str='exact', + after: Optional[float]=None, + before: Optional[float]=None, + yes: bool=False, + delete: bool=False, + out_dir: str=OUTPUT_DIR) -> List[Link]: + + check_data_folder(out_dir=out_dir) + + if filter_str and filter_patterns: + stderr( + '[X] You should pass either a pattern as an argument, ' + 'or pass a list of patterns via stdin, but not both.\n', + color='red', + ) + raise SystemExit(2) + elif not (filter_str or filter_patterns): + stderr( + '[X] You should pass either a pattern as an argument, ' + 'or pass a list of patterns 
via stdin.', + color='red', + ) + stderr() + stderr(' {lightred}Hint:{reset} To remove all urls you can run:'.format(**ANSI)) + stderr(" archivebox remove --filter-type=regex '.*'") + stderr() + raise SystemExit(2) + elif filter_str: + filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')] + + log_list_started(filter_patterns, filter_type) + timer = TimedProgress(360, prefix=' ') + try: + links = list(list_links( + filter_patterns=filter_patterns, + filter_type=filter_type, + after=after, + before=before, + )) + finally: + timer.end() + + if not len(links): + log_removal_finished(0, 0) + raise SystemExit(1) + + + log_list_finished(links) + log_removal_started(links, yes=yes, delete=delete) + + timer = TimedProgress(360, prefix=' ') + try: + to_keep = [] + all_links = load_main_index(out_dir=out_dir) + for link in all_links: + should_remove = ( + (after is not None and float(link.timestamp) < after) + or (before is not None and float(link.timestamp) > before) + or link_matches_filter(link, filter_patterns, filter_type) + ) + if not should_remove: + to_keep.append(link) + elif should_remove and delete: + shutil.rmtree(link.link_dir, ignore_errors=True) + finally: + timer.end() + + write_main_index(links=to_keep, out_dir=out_dir, finished=True) + log_removal_finished(len(all_links), len(to_keep)) + + return to_keep + +@enforce_types +def update(resume: Optional[float]=None, + only_new: bool=not ONLY_NEW, + index_only: bool=False, + overwrite: bool=False, + filter_patterns_str: Optional[str]=None, + filter_patterns: Optional[List[str]]=None, + filter_type: Optional[str]=None, + status: Optional[str]=None, + after: Optional[str]=None, + before: Optional[str]=None, + out_dir: str=OUTPUT_DIR) -> List[Link]: + """The main ArchiveBox entrancepoint. 
Everything starts here.""" + + check_dependencies() + check_data_folder(out_dir=out_dir) + + # Step 1: Load list of links from the existing index + # merge in and dedupe new links from import_path + all_links: List[Link] = [] + new_links: List[Link] = [] + all_links = load_main_index(out_dir=out_dir) + + # Step 2: Write updated index with deduped old and new links back to disk + write_main_index(links=list(all_links), out_dir=out_dir) + + # Step 3: Filter for selected_links + matching_links = list_links( + filter_patterns=filter_patterns, + filter_type=filter_type, + before=before, + after=after, + ) + matching_folders = list_folders( + links=list(matching_links), + status=status, + out_dir=out_dir, + ) + all_links = [link for link in matching_folders.values() if link] + + if index_only: + return all_links + + # Step 3: Run the archive methods for each link + links = new_links if only_new else all_links + log_archiving_started(len(links), resume) + idx: int = 0 + link: Link = None # type: ignore + try: + for idx, link in enumerate(links_after_timestamp(links, resume)): + archive_link(link, overwrite=overwrite, out_dir=link.link_dir) + + except KeyboardInterrupt: + log_archiving_paused(len(links), idx, link.timestamp if link else '0') + raise SystemExit(0) + + except: + print() + raise + + log_archiving_finished(len(links)) + + # Step 4: Re-write links index with updated titles, icons, and resources + all_links = load_main_index(out_dir=out_dir) + write_main_index(links=list(all_links), out_dir=out_dir, finished=True) + return all_links + +@enforce_types +def list_all(filter_patterns_str: Optional[str]=None, + filter_patterns: Optional[List[str]]=None, + filter_type: str='exact', + status: Optional[str]=None, + after: Optional[float]=None, + before: Optional[float]=None, + sort: Optional[str]=None, + csv: Optional[str]=None, + json: Optional[str]=None, + out_dir: str=OUTPUT_DIR) -> Iterable[Link]: + + check_data_folder(out_dir=out_dir) + + if filter_patterns and filter_patterns_str: + stderr( + '[X] You should either pass filter patterns as an arguments ' + 'or via stdin, but not both.\n', + color='red', + ) + raise SystemExit(2) + elif filter_patterns_str: + filter_patterns = filter_patterns_str.split('\n') + + + links = list_links( + filter_patterns=filter_patterns, + filter_type=filter_type, + before=before, + after=after, + ) + + if sort: + links = sorted(links, key=lambda link: getattr(link, sort)) + + folders = list_folders( + links=list(links), + status=status, + out_dir=out_dir, + ) + + if csv: + print(links_to_csv(folders.values(), csv_cols=csv.split(','), header=True)) + elif json: + print(to_json(folders.values(), indent=4, sort_keys=True)) + else: + print(folders_to_str(folders)) + raise SystemExit(not folders) + + +@enforce_types +def list_links(filter_patterns: Optional[List[str]]=None, + filter_type: str='exact', + after: Optional[float]=None, + before: Optional[float]=None, + out_dir: str=OUTPUT_DIR) -> Iterable[Link]: + + check_data_folder(out_dir=out_dir) + + all_links = load_main_index(out_dir=out_dir) + + for link in all_links: + if after is not None and float(link.timestamp) < after: + continue + if before is not None and float(link.timestamp) > before: + continue + + if filter_patterns: + if link_matches_filter(link, filter_patterns, filter_type): + yield link + else: + yield link + +@enforce_types +def list_folders(links: List[Link], + status: str, + out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]: + + check_data_folder() + + if status == 'indexed': + return 
get_indexed_folders(links, out_dir=out_dir) + elif status == 'archived': + return get_archived_folders(links, out_dir=out_dir) + elif status == 'unarchived': + return get_unarchived_folders(links, out_dir=out_dir) + + elif status == 'present': + return get_present_folders(links, out_dir=out_dir) + elif status == 'valid': + return get_valid_folders(links, out_dir=out_dir) + elif status == 'invalid': + return get_invalid_folders(links, out_dir=out_dir) + + elif status == 'duplicate': + return get_duplicate_folders(links, out_dir=out_dir) + elif status == 'orphaned': + return get_orphaned_folders(links, out_dir=out_dir) + elif status == 'corrupted': + return get_corrupted_folders(links, out_dir=out_dir) + elif status == 'unrecognized': + return get_unrecognized_folders(links, out_dir=out_dir) + + raise ValueError('Status not recognized.') + + +def config(config_options_str: Optional[str]=None, + config_options: Optional[List[str]]=None, + get: bool=False, + set: bool=False, + reset: bool=False, + out_dir: str=OUTPUT_DIR) -> None: + + check_data_folder(out_dir=out_dir) + + if config_options and config_options_str: + stderr( + '[X] You should either pass config values as an arguments ' + 'or via stdin, but not both.\n', + color='red', + ) + raise SystemExit(2) + elif config_options_str: + config_options = stdin_raw_text.split('\n') + + config_options = config_options or [] + + no_args = not (get or set or reset or config_options) + + matching_config: ConfigDict = {} + if get or no_args: + if config_options: + config_options = [get_real_name(key) for key in config_options] + matching_config = {key: CONFIG[key] for key in config_options if key in CONFIG} + failed_config = [key for key in config_options if key not in CONFIG] + if failed_config: + stderr() + stderr('[X] These options failed to get', color='red') + stderr(' {}'.format('\n '.join(config_options))) + raise SystemExit(1) + else: + matching_config = CONFIG + + print(printable_config(matching_config)) + raise SystemExit(not matching_config) + elif set: + new_config = {} + failed_options = [] + for line in config_options: + if line.startswith('#') or not line.strip(): + continue + if '=' not in line: + stderr('[X] Config KEY=VALUE must have an = sign in it', color='red') + stderr(f' {line}') + raise SystemExit(2) + + raw_key, val = line.split('=') + raw_key = raw_key.upper().strip() + key = get_real_name(raw_key) + if key != raw_key: + stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow') + + if key in CONFIG: + new_config[key] = val.strip() + else: + failed_options.append(line) + + if new_config: + before = CONFIG + matching_config = write_config_file(new_config, out_dir=OUTPUT_DIR) + after = load_all_config() + print(printable_config(matching_config)) + + side_effect_changes: ConfigDict = {} + for key, val in after.items(): + if key in USER_CONFIG and (before[key] != after[key]) and (key not in matching_config): + side_effect_changes[key] = after[key] + + if side_effect_changes: + stderr() + stderr('[i] Note: This change also affected these other options that depended on it:', color='lightyellow') + print(' {}'.format(printable_config(side_effect_changes, prefix=' '))) + if failed_options: + stderr() + stderr('[X] These options failed to set:', color='red') + stderr(' {}'.format('\n '.join(failed_options))) + raise SystemExit(bool(failed_options)) + elif reset: + stderr('[X] This command is not implemented yet.', color='red') + stderr(' Please manually 
remove the relevant lines from your config file:') + stderr(f' {CONFIG_FILE}') + raise SystemExit(2) + + else: + stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red') + stderr(' archivebox config') + stderr(' archivebox config --get SOME_KEY') + stderr(' archivebox config --set SOME_KEY=SOME_VALUE') + raise SystemExit(2) + + +CRON_COMMENT = 'archivebox_schedule' + +@enforce_types +def schedule(add: bool=False, + show: bool=False, + clear: bool=False, + foreground: bool=False, + run_all: bool=False, + quiet: bool=False, + every: Optional[str]=None, + import_path: Optional[str]=None, + out_dir: str=OUTPUT_DIR): + + check_data_folder(out_dir=out_dir) + + os.makedirs(os.path.join(out_dir, LOGS_DIR_NAME), exist_ok=True) + + cron = CronTab(user=True) + cron = dedupe_jobs(cron) + + existing_jobs = list(cron.find_comment(CRON_COMMENT)) + if foreground or run_all: + if import_path or (not existing_jobs): + stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI)) + stderr(' archivebox schedule --every=hour https://example.com/some/rss/feed.xml') + raise SystemExit(1) + print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI)) + if run_all: + try: + for job in existing_jobs: + sys.stdout.write(f' > {job.command}') + sys.stdout.flush() + job.run() + sys.stdout.write(f'\r √ {job.command}\n') + except KeyboardInterrupt: + print('\n{green}[√] Stopped.{reset}'.format(**ANSI)) + raise SystemExit(1) + if foreground: + try: + for result in cron.run_scheduler(): + print(result) + except KeyboardInterrupt: + print('\n{green}[√] Stopped.{reset}'.format(**ANSI)) + raise SystemExit(1) + + elif show: + if existing_jobs: + print('\n'.join(str(cmd) for cmd in existing_jobs)) + else: + stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI)) + stderr(' To schedule a new job, run:') + stderr(' archivebox schedule --every=[timeperiod] https://example.com/some/rss/feed.xml') + raise SystemExit(0) + + elif clear: + print(cron.remove_all(comment=CRON_COMMENT)) + cron.write() + raise SystemExit(0) + + elif every: + quoted = lambda s: f'"{s}"' if s and ' ' in s else s + cmd = [ + 'cd', + quoted(out_dir), + '&&', + quoted(ARCHIVEBOX_BINARY), + *(['add', f'"{import_path}"'] if import_path else ['update']), + '2>&1', + '>', + quoted(os.path.join(LOGS_DIR, 'archivebox.log')), + + ] + new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT) + + if every in ('minute', 'hour', 'day', 'week', 'month', 'year'): + set_every = getattr(new_job.every(), every) + set_every() + elif CronSlices.is_valid(every): + new_job.setall(every) + else: + stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI)) + stderr(' It must be one of minute/hour/day/week/month') + stderr(' or a quoted cron-format schedule like:') + stderr(' archivebox init --every=day https://example.com/some/rss/feed.xml') + stderr(' archivebox init --every="0/5 * * * *" https://example.com/some/rss/feed.xml') + raise SystemExit(1) + + cron = dedupe_jobs(cron) + cron.write() + + total_runs = sum(j.frequency_per_year() for j in cron) + existing_jobs = list(cron.find_comment(CRON_COMMENT)) + + print() + print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI)) + print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in 
existing_jobs)) + if total_runs > 60 and not quiet: + stderr() + stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI)) + stderr(f' Congrats on being an enthusiastic internet archiver! πŸ‘Œ') + stderr() + stderr(' Make sure you have enough storage space available to hold all the data.') + stderr(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.') + raise SystemExit(0) + + + + + +def server(runserver_args: Optional[List[str]]=None, reload: bool=False, out_dir: str=OUTPUT_DIR) -> None: + runserver_args = runserver_args or [] + check_data_folder(out_dir=out_dir) + + setup_django(out_dir) + from django.core.management import call_command + from django.contrib.auth.models import User + + if IS_TTY and not User.objects.filter(is_superuser=True).exists(): + print('{lightyellow}[!] No admin users exist yet, you will not be able to edit links in the UI.{reset}'.format(**ANSI)) + print() + print(' To create an admin user, run:') + print(' archivebox manage createsuperuser') + print() + + print('{green}[+] Starting ArchiveBox webserver...{reset}'.format(**ANSI)) + if not reload: + runserver_args.append('--noreload') + + call_command("runserver", *runserver_args) + + +def manage(args: Optional[List[str]]=None, out_dir: str=OUTPUT_DIR) -> None: + check_data_folder(out_dir=out_dir) + + setup_django(out_dir) + from django.core.management import execute_from_command_line + + execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])]) + +def shell(out_dir: str=OUTPUT_DIR) -> None: + check_data_folder(out_dir=out_dir) + + setup_django(OUTPUT_DIR) + from django.core.management import call_command + call_command("shell_plus") + +# Helpers + +def printable_config(config: ConfigDict, prefix: str='') -> str: + return f'\n{prefix}'.join( + f'{key}={val}' + for key, val in config.items() + if not (isinstance(val, dict) or callable(val)) + ) + +def dedupe_jobs(cron: CronTab) -> CronTab: + deduped: Set[Tuple[str, str]] = set() + + for job in list(cron): + unique_tuple = (str(job.slices), job.command) + if unique_tuple not in deduped: + deduped.add(unique_tuple) + cron.remove(job) + + for schedule, command in deduped: + job = cron.new(command=command, comment=CRON_COMMENT) + job.setall(schedule) + job.enable() + + return cron + + +def print_folder_status(name, folder): + if folder['enabled']: + if folder['is_valid']: + color, symbol, note = 'green', '√', 'valid' + else: + color, symbol, note, num_files = 'red', 'X', 'invalid', '?' + else: + color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-' + + if folder['path']: + if os.path.exists(folder['path']): + num_files = ( + f'{len(os.listdir(folder["path"]))} files' + if os.path.isdir(folder['path']) else + human_readable_size(os.path.getsize(folder['path'])) + ) + else: + num_files = 'missing' + + if ' ' in folder['path']: + folder['path'] = f'"{folder["path"]}"' + + print( + ANSI[color], + symbol, + ANSI['reset'], + name.ljust(22), + (folder["path"] or '').ljust(76), + num_files.ljust(14), + ANSI[color], + note, + ANSI['reset'], + ) + + +def print_dependency_version(name, dependency): + if dependency['enabled']: + if dependency['is_valid']: + color, symbol, note = 'green', '√', 'valid' + version = 'v' + re.search(r'[\d\.]+', dependency['version'])[0] + else: + color, symbol, note, version = 'red', 'X', 'invalid', '?' 
+ else: + color, symbol, note, version = 'lightyellow', '-', 'disabled', '-' + + if ' ' in dependency["path"]: + dependency["path"] = f'"{dependency["path"]}"' + + print( + ANSI[color], + symbol, + ANSI['reset'], + name.ljust(22), + (dependency["path"] or '').ljust(76), + version.ljust(14), + ANSI[color], + note, + ANSI['reset'], + ) diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py new file mode 100644 index 00000000..2a20ff6d --- /dev/null +++ b/archivebox/parsers/__init__.py @@ -0,0 +1,68 @@ +""" +Everything related to parsing links from input sources. + +For a list of supported services, see the README.md. +For examples of supported import formats see tests/. +""" + +__package__ = 'archivebox.parsers' + + +from typing import Tuple, List + +from ..config import TIMEOUT +from ..util import ( + check_url_parsing_invariants, + TimedProgress, + Link, + enforce_types, +) +from .pocket_html import parse_pocket_html_export +from .pinboard_rss import parse_pinboard_rss_export +from .shaarli_rss import parse_shaarli_rss_export +from .medium_rss import parse_medium_rss_export +from .netscape_html import parse_netscape_html_export +from .generic_rss import parse_generic_rss_export +from .generic_json import parse_generic_json_export +from .generic_txt import parse_generic_txt_export + + +@enforce_types +def parse_links(source_file: str) -> Tuple[List[Link], str]: + """parse a list of URLs with their metadata from an + RSS feed, bookmarks export, or text file + """ + + check_url_parsing_invariants() + PARSERS = ( + # Specialized parsers + ('Pocket HTML', parse_pocket_html_export), + ('Pinboard RSS', parse_pinboard_rss_export), + ('Shaarli RSS', parse_shaarli_rss_export), + ('Medium RSS', parse_medium_rss_export), + + # General parsers + ('Netscape HTML', parse_netscape_html_export), + ('Generic RSS', parse_generic_rss_export), + ('Generic JSON', parse_generic_json_export), + + # Fallback parser + ('Plain Text', parse_generic_txt_export), + ) + timer = TimedProgress(TIMEOUT * 4) + with open(source_file, 'r', encoding='utf-8') as file: + for parser_name, parser_func in PARSERS: + try: + links = list(parser_func(file)) + if links: + timer.end() + return links, parser_name + except Exception as err: # noqa + # Parsers are tried one by one down the list, and the first one + # that succeeds is used. To see why a certain parser was not used + # due to error or format incompatibility, uncomment this line: + # print('[!] 
Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err)) + pass + + timer.end() + return [], 'Failed to parse' diff --git a/archivebox/parsers/generic_json.py b/archivebox/parsers/generic_json.py new file mode 100644 index 00000000..8b20e6f4 --- /dev/null +++ b/archivebox/parsers/generic_json.py @@ -0,0 +1,65 @@ +__package__ = 'archivebox.parsers' + +import json + +from typing import IO, Iterable +from datetime import datetime + +from ..index.schema import Link +from ..util import ( + htmldecode, + enforce_types, +) + + +@enforce_types +def parse_generic_json_export(json_file: IO[str]) -> Iterable[Link]: + """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)""" + + json_file.seek(0) + links = json.load(json_file) + json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z') + + for link in links: + # example line + # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}] + if link: + # Parse URL + url = link.get('href') or link.get('url') or link.get('URL') + if not url: + raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]') + + # Parse the timestamp + ts_str = str(datetime.now().timestamp()) + if link.get('timestamp'): + # chrome/ff histories use a very precise timestamp + ts_str = str(link['timestamp'] / 10000000) + elif link.get('time'): + ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp()) + elif link.get('created_at'): + ts_str = str(json_date(link['created_at']).timestamp()) + elif link.get('created'): + ts_str = str(json_date(link['created']).timestamp()) + elif link.get('date'): + ts_str = str(json_date(link['date']).timestamp()) + elif link.get('bookmarked'): + ts_str = str(json_date(link['bookmarked']).timestamp()) + elif link.get('saved'): + ts_str = str(json_date(link['saved']).timestamp()) + + # Parse the title + title = None + if link.get('title'): + title = link['title'].strip() + elif link.get('description'): + title = link['description'].replace(' β€” Readability', '').strip() + elif link.get('name'): + title = link['name'].strip() + + yield Link( + url=htmldecode(url), + timestamp=ts_str, + title=htmldecode(title) or None, + tags=htmldecode(link.get('tags')) or '', + sources=[json_file.name], + ) diff --git a/archivebox/parsers/generic_rss.py b/archivebox/parsers/generic_rss.py new file mode 100644 index 00000000..3a62bb88 --- /dev/null +++ b/archivebox/parsers/generic_rss.py @@ -0,0 +1,49 @@ +__package__ = 'archivebox.parsers' + + +from typing import IO, Iterable +from datetime import datetime + +from ..index.schema import Link +from ..util import ( + htmldecode, + enforce_types, + str_between, +) + +@enforce_types +def parse_generic_rss_export(rss_file: IO[str]) -> Iterable[Link]: + """Parse RSS XML-format files into links""" + + rss_file.seek(0) + items = rss_file.read().split('') + items = items[1:] if items else [] + for item in items: + # example item: + # + # <![CDATA[How JavaScript works: inside the V8 engine]]> + # Unread + # https://blog.sessionstack.com/how-javascript-works-inside + # https://blog.sessionstack.com/how-javascript-works-inside + # Mon, 21 Aug 2017 14:21:58 -0500 + # + + trailing_removed = item.split('', 1)[0] + leading_removed = trailing_removed.split('', 1)[-1].strip() + rows = leading_removed.split('\n') + + def get_row(key): + return [r 
for r in rows if r.strip().startswith('<{}>'.format(key))][0] + + url = str_between(get_row('link'), '', '') + ts_str = str_between(get_row('pubDate'), '', '') + time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z") + title = str_between(get_row('title'), ' Iterable[Link]: + """Parse raw links from each line in a text file""" + + text_file.seek(0) + for line in text_file.readlines(): + urls = re.findall(URL_REGEX, line) if line.strip() else () + for url in urls: # type: ignore + yield Link( + url=htmldecode(url), + timestamp=str(datetime.now().timestamp()), + title=None, + tags=None, + sources=[text_file.name], + ) diff --git a/archivebox/parsers/medium_rss.py b/archivebox/parsers/medium_rss.py new file mode 100644 index 00000000..11379677 --- /dev/null +++ b/archivebox/parsers/medium_rss.py @@ -0,0 +1,35 @@ +__package__ = 'archivebox.parsers' + + +from typing import IO, Iterable +from datetime import datetime + +from xml.etree import ElementTree + +from ..index.schema import Link +from ..util import ( + htmldecode, + enforce_types, +) + + +@enforce_types +def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]: + """Parse Medium RSS feed files into links""" + + rss_file.seek(0) + root = ElementTree.parse(rss_file).getroot() + items = root.find("channel").findall("item") # type: ignore + for item in items: + url = item.find("link").text # type: ignore + title = item.find("title").text.strip() # type: ignore + ts_str = item.find("pubDate").text # type: ignore + time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z") # type: ignore + + yield Link( + url=htmldecode(url), + timestamp=str(time.timestamp()), + title=htmldecode(title) or None, + tags=None, + sources=[rss_file.name], + ) diff --git a/archivebox/parsers/netscape_html.py b/archivebox/parsers/netscape_html.py new file mode 100644 index 00000000..894e2318 --- /dev/null +++ b/archivebox/parsers/netscape_html.py @@ -0,0 +1,39 @@ +__package__ = 'archivebox.parsers' + + +import re + +from typing import IO, Iterable +from datetime import datetime + +from ..index.schema import Link +from ..util import ( + htmldecode, + enforce_types, +) + + +@enforce_types +def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]: + """Parse netscape-format bookmarks export files (produced by all browsers)""" + + html_file.seek(0) + pattern = re.compile("]*>(.+)", re.UNICODE | re.IGNORECASE) + for line in html_file: + # example line + #
    example bookmark title + + match = pattern.search(line) + if match: + url = match.group(1) + time = datetime.fromtimestamp(float(match.group(2))) + title = match.group(3).strip() + + yield Link( + url=htmldecode(url), + timestamp=str(time.timestamp()), + title=htmldecode(title) or None, + tags=None, + sources=[html_file.name], + ) + diff --git a/archivebox/parsers/pinboard_rss.py b/archivebox/parsers/pinboard_rss.py new file mode 100644 index 00000000..eb21c7ef --- /dev/null +++ b/archivebox/parsers/pinboard_rss.py @@ -0,0 +1,47 @@ +__package__ = 'archivebox.parsers' + + +from typing import IO, Iterable +from datetime import datetime + +from xml.etree import ElementTree + +from ..index.schema import Link +from ..util import ( + htmldecode, + enforce_types, +) + + +@enforce_types +def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]: + """Parse Pinboard RSS feed files into links""" + + rss_file.seek(0) + root = ElementTree.parse(rss_file).getroot() + items = root.findall("{http://purl.org/rss/1.0/}item") + for item in items: + find = lambda p: item.find(p).text.strip() if item.find(p) else None # type: ignore + + url = find("{http://purl.org/rss/1.0/}link") + tags = find("{http://purl.org/dc/elements/1.1/}subject") + title = find("{http://purl.org/rss/1.0/}title") + ts_str = find("{http://purl.org/dc/elements/1.1/}date") + + # Pinboard includes a colon in its date stamp timezone offsets, which + # Python can't parse. Remove it: + if ts_str and ts_str[-3:-2] == ":": + ts_str = ts_str[:-3]+ts_str[-2:] + + if ts_str: + time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z") + else: + time = datetime.now() + + yield Link( + url=htmldecode(url), + timestamp=str(time.timestamp()), + title=htmldecode(title) or None, + tags=htmldecode(tags) or None, + sources=[rss_file.name], + ) diff --git a/archivebox/parsers/pocket_html.py b/archivebox/parsers/pocket_html.py new file mode 100644 index 00000000..3eae58c4 --- /dev/null +++ b/archivebox/parsers/pocket_html.py @@ -0,0 +1,38 @@ +__package__ = 'archivebox.parsers' + + +import re + +from typing import IO, Iterable +from datetime import datetime + +from ..index.schema import Link +from ..util import ( + htmldecode, + enforce_types, +) + + +@enforce_types +def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]: + """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)""" + + html_file.seek(0) + pattern = re.compile("^\\s*
<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE) +    for line in html_file: +        # example line +        # <li><a href="http://example.com/some/page" time_added="1234567890" tags="tag1,tag2">example title</a></li>
  • + match = pattern.search(line) + if match: + url = match.group(1).replace('http://www.readability.com/read?url=', '') # remove old readability prefixes to get original url + time = datetime.fromtimestamp(float(match.group(2))) + tags = match.group(3) + title = match.group(4).replace(' β€” Readability', '').replace('http://www.readability.com/read?url=', '') + + yield Link( + url=htmldecode(url), + timestamp=str(time.timestamp()), + title=htmldecode(title) or None, + tags=tags or '', + sources=[html_file.name], + ) diff --git a/archivebox/parsers/shaarli_rss.py b/archivebox/parsers/shaarli_rss.py new file mode 100644 index 00000000..ae5bfa96 --- /dev/null +++ b/archivebox/parsers/shaarli_rss.py @@ -0,0 +1,50 @@ +__package__ = 'archivebox.parsers' + + +from typing import IO, Iterable +from datetime import datetime + +from ..index.schema import Link +from ..util import ( + htmldecode, + enforce_types, + str_between, +) + + +@enforce_types +def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]: + """Parse Shaarli-specific RSS XML-format files into links""" + + rss_file.seek(0) + entries = rss_file.read().split('')[1:] + for entry in entries: + # example entry: + # + # Aktuelle Trojaner-Welle: Emotet lauert in gefÀlschten Rechnungsmails | heise online + # + # https://demo.shaarli.org/?cEV4vw + # 2019-01-30T06:06:01+00:00 + # 2019-01-30T06:06:01+00:00 + #

#   <content type="html"><![CDATA[... <a href="https://demo.shaarli.org/?cEV4vw">Permalink</a> ...]]></content> +        #   </entry> + +        trailing_removed = entry.split('</entry>
    ', 1)[0] + leading_removed = trailing_removed.strip() + rows = leading_removed.split('\n') + + def get_row(key): + return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0] + + title = str_between(get_row('title'), '', '').strip() + url = str_between(get_row('link'), '') + ts_str = str_between(get_row('published'), '', '') + time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z") + + yield Link( + url=htmldecode(url), + timestamp=str(time.timestamp()), + title=htmldecode(title) or None, + tags=None, + sources=[rss_file.name], + ) diff --git a/archivebox/legacy/templates/favicon.ico b/archivebox/themes/legacy/favicon.ico similarity index 100% rename from archivebox/legacy/templates/favicon.ico rename to archivebox/themes/legacy/favicon.ico diff --git a/archivebox/legacy/templates/link_details.html b/archivebox/themes/legacy/link_details.html similarity index 100% rename from archivebox/legacy/templates/link_details.html rename to archivebox/themes/legacy/link_details.html diff --git a/archivebox/legacy/templates/main_index.html b/archivebox/themes/legacy/main_index.html similarity index 100% rename from archivebox/legacy/templates/main_index.html rename to archivebox/themes/legacy/main_index.html diff --git a/archivebox/legacy/templates/main_index_row.html b/archivebox/themes/legacy/main_index_row.html similarity index 100% rename from archivebox/legacy/templates/main_index_row.html rename to archivebox/themes/legacy/main_index_row.html diff --git a/archivebox/legacy/templates/robots.txt b/archivebox/themes/legacy/robots.txt similarity index 100% rename from archivebox/legacy/templates/robots.txt rename to archivebox/themes/legacy/robots.txt diff --git a/archivebox/legacy/templates/static/archive.png b/archivebox/themes/legacy/static/archive.png similarity index 100% rename from archivebox/legacy/templates/static/archive.png rename to archivebox/themes/legacy/static/archive.png diff --git a/archivebox/legacy/templates/static/bootstrap.min.css b/archivebox/themes/legacy/static/bootstrap.min.css similarity index 100% rename from archivebox/legacy/templates/static/bootstrap.min.css rename to archivebox/themes/legacy/static/bootstrap.min.css diff --git a/archivebox/legacy/templates/static/external.png b/archivebox/themes/legacy/static/external.png similarity index 100% rename from archivebox/legacy/templates/static/external.png rename to archivebox/themes/legacy/static/external.png diff --git a/archivebox/legacy/templates/static/jquery.dataTables.min.css b/archivebox/themes/legacy/static/jquery.dataTables.min.css similarity index 100% rename from archivebox/legacy/templates/static/jquery.dataTables.min.css rename to archivebox/themes/legacy/static/jquery.dataTables.min.css diff --git a/archivebox/legacy/templates/static/jquery.dataTables.min.js b/archivebox/themes/legacy/static/jquery.dataTables.min.js similarity index 100% rename from archivebox/legacy/templates/static/jquery.dataTables.min.js rename to archivebox/themes/legacy/static/jquery.dataTables.min.js diff --git a/archivebox/legacy/templates/static/jquery.min.js b/archivebox/themes/legacy/static/jquery.min.js similarity index 100% rename from archivebox/legacy/templates/static/jquery.min.js rename to archivebox/themes/legacy/static/jquery.min.js diff --git a/archivebox/legacy/templates/static/sort_asc.png b/archivebox/themes/legacy/static/sort_asc.png similarity index 100% rename from archivebox/legacy/templates/static/sort_asc.png rename to archivebox/themes/legacy/static/sort_asc.png diff --git 
a/archivebox/legacy/templates/static/sort_both.png b/archivebox/themes/legacy/static/sort_both.png similarity index 100% rename from archivebox/legacy/templates/static/sort_both.png rename to archivebox/themes/legacy/static/sort_both.png diff --git a/archivebox/legacy/templates/static/sort_desc.png b/archivebox/themes/legacy/static/sort_desc.png similarity index 100% rename from archivebox/legacy/templates/static/sort_desc.png rename to archivebox/themes/legacy/static/sort_desc.png diff --git a/archivebox/legacy/templates/static/spinner.gif b/archivebox/themes/legacy/static/spinner.gif similarity index 100% rename from archivebox/legacy/templates/static/spinner.gif rename to archivebox/themes/legacy/static/spinner.gif diff --git a/archivebox/legacy/util.py b/archivebox/util.py similarity index 93% rename from archivebox/legacy/util.py rename to archivebox/util.py index 327f0270..447b9eff 100644 --- a/archivebox/legacy/util.py +++ b/archivebox/util.py @@ -1,6 +1,7 @@ import os import re import sys +import ssl import json import time import shutil @@ -8,7 +9,7 @@ import argparse from string import Template from json import JSONEncoder -from typing import List, Optional, Any, Union, IO, Mapping, Tuple +from typing import List, Dict, Optional, Any, Union, IO, Mapping, Tuple from inspect import signature from functools import wraps from hashlib import sha256 @@ -28,11 +29,12 @@ from subprocess import ( from base32_crockford import encode as base32_encode # type: ignore -from .schema import Link +from .index.schema import Link from .config import ( ANSI, TERM_WIDTH, - SOURCES_DIR, + OUTPUT_DIR, + SOURCES_DIR_NAME, OUTPUT_PERMISSIONS, TIMEOUT, SHOW_PROGRESS, @@ -40,8 +42,9 @@ from .config import ( CHECK_SSL_VALIDITY, WGET_USER_AGENT, CHROME_OPTIONS, + check_data_folder, ) -from .logs import pretty_path +from .cli.logging import pretty_path ### Parsing Helpers @@ -187,31 +190,36 @@ def check_url_parsing_invariants() -> None: ### Random Helpers @enforce_types -def handle_stdin_import(raw_text: str) -> str: - if not os.path.exists(SOURCES_DIR): - os.makedirs(SOURCES_DIR) +def save_stdin_to_sources(raw_text: str, out_dir: str=OUTPUT_DIR) -> str: + check_data_folder(out_dir=out_dir) + + sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME) + if not os.path.exists(sources_dir): + os.makedirs(sources_dir) ts = str(datetime.now().timestamp()).split('.', 1)[0] - source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format('stdin', ts)) + source_path = os.path.join(sources_dir, '{}-{}.txt'.format('stdin', ts)) atomic_write(raw_text, source_path) return source_path @enforce_types -def handle_file_import(path: str, timeout: int=TIMEOUT) -> str: +def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DIR) -> str: """download a given url's content into output/sources/domain-.txt""" + check_data_folder(out_dir=out_dir) - if not os.path.exists(SOURCES_DIR): - os.makedirs(SOURCES_DIR) + sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME) + if not os.path.exists(sources_dir): + os.makedirs(sources_dir) ts = str(datetime.now().timestamp()).split('.', 1)[0] - source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(basename(path), ts)) + source_path = os.path.join(sources_dir, '{}-{}.txt'.format(basename(path), ts)) if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')): - source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(path), ts)) + source_path = os.path.join(sources_dir, '{}-{}.txt'.format(domain(path), ts)) print('{}[*] [{}] Downloading {}{}'.format( 
ANSI['green'], datetime.now().strftime('%Y-%m-%d %H:%M:%S'), @@ -532,7 +540,6 @@ def download_url(url: str, timeout: int=TIMEOUT) -> str: if CHECK_SSL_VALIDITY: resp = urlopen(req, timeout=timeout) else: - import ssl insecure = ssl._create_unverified_context() resp = urlopen(req, timeout=timeout, context=insecure) @@ -662,7 +669,7 @@ def to_json(obj: Any, file: IO=None, indent: Optional[int]=4, sort_keys: bool=Tr return json.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder) -def to_csv(links: List[Link], csv_cols: Optional[List[str]]=None, +def links_to_csv(links: List[Link], csv_cols: Optional[List[str]]=None, header: bool=True, ljust: int=0, separator: str=',') -> str: csv_cols = csv_cols or ['timestamp', 'is_archived', 'url'] @@ -677,6 +684,8 @@ def to_csv(links: List[Link], csv_cols: Optional[List[str]]=None, return '\n'.join((header_str, *row_strs)) +def folders_to_str(folders: Dict[str, Optional[Link]]) -> str: + return '\n'.join(f'{folder} {link}' for folder, link in folders.items()) @enforce_types def render_template(template_path: str, context: Mapping[str, str]) -> str: @@ -713,11 +722,11 @@ def atomic_write(contents: Union[dict, str, bytes], path: str) -> None: os.remove(tmp_file) -def reject_stdin(caller: str) -> None: +def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None: """Tell the user they passed stdin to a command that doesn't accept it""" - if not sys.stdin.isatty(): - stdin_raw_text = sys.stdin.read().strip() + if stdin and not stdin.isatty(): + stdin_raw_text = stdin.read().strip() if stdin_raw_text: print( '{red}[X] The "{}" command does not accept stdin.{reset}\n'.format( @@ -731,9 +740,30 @@ def reject_stdin(caller: str) -> None: print() raise SystemExit(1) +def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]: + if stdin and not stdin.isatty(): + return stdin.read() + return None + + +def set_docstring(text: str): + def decorator(func): + @wraps(func) + def wrapper_with_docstring(*args, **kwargs): + return func(*args, **kwargs) + wrapper_with_docstring.__doc__ = text + return wrapper_with_docstring + return decorator + class SmartFormatter(argparse.HelpFormatter): def _split_lines(self, text, width): if '\n' in text: return text.splitlines() return argparse.HelpFormatter._split_lines(self, text, width) + + +class ArchiveError(Exception): + def __init__(self, message, hints=None): + super().__init__(message) + self.hints = hints
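
For reference, a minimal sketch (not part of the diff) of driving the new parsers package directly: parse_links() tries each parser in order and returns the parsed links plus the name of the parser that succeeded. The sources file path below is made up, and the snippet assumes the archivebox package is importable.

from archivebox.parsers import parse_links

# hypothetical sources file saved by save_stdin_to_sources()
links, parser_name = parse_links('output/sources/stdin-1551000000.txt')
print(f'parsed {len(links)} links using the {parser_name} parser')
for link in links[:5]:
    print(link.timestamp, link.url)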
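Likewise, a rough sketch of calling the new list helpers programmatically instead of via `archivebox list`. It assumes these functions live in archivebox.main (as the CLI wiring suggests), an already-initialized data folder in the default OUTPUT_DIR, and a made-up URL with the default 'exact' filter type.

from archivebox.main import list_links, list_folders
from archivebox.util import links_to_csv

# filter the main index down to one hypothetical URL
matching = list(list_links(
    filter_patterns=['https://example.com/page'],
    filter_type='exact',
    after=None,
    before=None,
))
# keep only folders that have been archived, then render as CSV
folders = list_folders(links=matching, status='archived')
print(links_to_csv(
    [link for link in folders.values() if link],
    csv_cols=['timestamp', 'url'],
    header=True,
))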
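Finally, a self-contained sketch of the generic JSON parser on a Pinboard-style record. The bookmark data is invented for illustration; a real (temporary) file object is used because the parser records json_file.name as the link source, and the '+0000' offset is chosen so the '%z' directive parses on all Python 3 versions.

import json
import tempfile

from archivebox.parsers.generic_json import parse_generic_json_export

# invented sample bookmark in the pinboard.in export shape
bookmarks = [{
    'href': 'http://www.example.com/some/page',
    'description': 'title here',
    'time': '2014-06-14T15:51:42+0000',
    'tags': 'example-tag',
}]

with tempfile.NamedTemporaryFile('w+', suffix='.json') as f:
    json.dump(bookmarks, f)
    f.flush()
    # the parser seeks to the start of the file itself before json.load()
    for link in parse_generic_json_export(f):
        print(link.timestamp, link.url, link.title, link.tags)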