move everything out of legacy folder
@@ -1,3 +1,6 @@
__package__ = 'archivebox'

from . import core
from . import cli

from .main import *

@@ -2,9 +2,14 @@

__package__ = 'archivebox'

from .cli.archivebox import main
import sys
from .cli import archivebox


def main():
    archivebox.main(args=sys.argv[1:], stdin=sys.stdin)


if __name__ == '__main__':
    main()
    archivebox.main(args=sys.argv[1:], stdin=sys.stdin)

@@ -2,13 +2,17 @@ __package__ = 'archivebox.cli'

import os

from typing import Dict
from typing import Dict, List, Optional, IO
from importlib import import_module

CLI_DIR = os.path.dirname(os.path.abspath(__file__))

# these common commands will appear sorted before any others for ease-of-use
display_first = ('help', 'version', 'init', 'info', 'config', 'list', 'update', 'add', 'remove')
meta_cmds = ('help', 'version')
main_cmds = ('init', 'info', 'config')
archive_cmds = ('add', 'remove', 'update', 'list')

display_first = (*meta_cmds, *main_cmds, *archive_cmds)

# every imported command module must have these properties in order to be valid
required_attrs = ('__package__', '__command__', 'main')

@@ -42,11 +46,14 @@ def list_subcommands() -> Dict[str, str]:
    return dict(sorted(COMMANDS, key=display_order))


def run_subcommand(subcommand: str, args=None) -> None:
def run_subcommand(subcommand: str,
                   subcommand_args: List[str]=None,
                   stdin: Optional[IO]=None,
                   pwd: Optional[str]=None) -> None:
    """run a given ArchiveBox subcommand with the given list of args"""

    module = import_module('.archivebox_{}'.format(subcommand), __package__)
    module.main(args) # type: ignore
    module.main(args=subcommand_args, stdin=stdin, pwd=pwd) # type: ignore


SUBCOMMANDS = list_subcommands()

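Note on the new subcommand contract: after this change `run_subcommand` forwards `subcommand_args`, `stdin`, and `pwd` into every `archivebox_*` module, so each command module is expected to expose a `main(args, stdin, pwd)` entrypoint. A minimal sketch of a module following that convention (the `example` command name and its body are hypothetical, used only for illustration):

__package__ = 'archivebox.cli'
__command__ = 'archivebox example'   # hypothetical command, not part of this diff
__description__ = 'Example of the new subcommand entrypoint shape'

import sys
import argparse

from typing import IO, List, Optional


def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    parser = argparse.ArgumentParser(prog=__command__, description=__description__, add_help=True)
    parser.parse_args(args or ())
    # a real command would accept or reject stdin here, then dispatch into archivebox.main
    print('would run in data dir:', pwd)


if __name__ == '__main__':
    main(args=sys.argv[1:], stdin=sys.stdin)
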
@@ -5,19 +5,17 @@ __package__ = 'archivebox.cli'
__command__ = 'archivebox'
__description__ = 'ArchiveBox: The self-hosted internet archive.'

import os
import sys
import argparse

from typing import Optional, List, IO

from . import list_subcommands, run_subcommand
from ..legacy.config import OUTPUT_DIR
from ..config import OUTPUT_DIR


def parse_args(args=None):
    args = sys.argv[1:] if args is None else args

def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    subcommands = list_subcommands()

    parser = argparse.ArgumentParser(
        prog=__command__,
        description=__description__,

@@ -43,54 +41,24 @@ def parse_args(args=None):
        default=None,
    )
    parser.add_argument(
        "args",
        "subcommand_args",
        help="Arguments for the subcommand",
        nargs=argparse.REMAINDER,
    )
    command = parser.parse_args(args or ())

    command = parser.parse_args(args)

    if command.help:
    if command.help or command.subcommand is None:
        command.subcommand = 'help'
    if command.version:
        command.subcommand = 'version'

    # print('--------------------------------------------')
    # print('Command: ', sys.argv[0])
    # print('Subcommand: ', command.subcommand)
    # print('Args to pass:', args[1:])
    # print('--------------------------------------------')

    return command.subcommand, command.args


def print_import_tutorial():
    print('Welcome to ArchiveBox!')
    print()
    print('To import an existing archive (from a previous version of ArchiveBox):')
    print(' 1. cd into your data dir OUTPUT_DIR (usually ArchiveBox/output) and run:')
    print(' 2. archivebox init')
    print()
    print('To start a new archive:')
    print(' 1. Create an empty directory, then cd into it and run:')
    print(' 2. archivebox init')
    print()
    print('For more information, see the migration docs here:')
    print(' https://github.com/pirate/ArchiveBox/wiki/Migration')

def main(args=None):
    subcommand, subcommand_args = parse_args(args)
    existing_index = os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))

    if subcommand is None:
        if existing_index:
            run_subcommand('help', subcommand_args)
        else:
            print_import_tutorial()
        raise SystemExit(0)

    run_subcommand(subcommand, subcommand_args)
    run_subcommand(
        subcommand=command.subcommand,
        subcommand_args=command.subcommand_args,
        stdin=stdin,
        pwd=pwd or OUTPUT_DIR,
    )


if __name__ == '__main__':
    main()
    main(args=sys.argv[1:], stdin=sys.stdin)

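Because the rewritten entrypoint takes explicit `args`, `stdin`, and `pwd` instead of reading process-level globals, it can also be driven programmatically, e.g. from tests. A small usage sketch (the `['init']` invocation is an assumed example, not taken from this diff):

import sys

from archivebox.cli import archivebox

# roughly equivalent to running `archivebox init` in the current directory
archivebox.main(args=['init'], stdin=None, pwd=None)

# forward whatever the user typed, as archivebox/__main__.py now does
archivebox.main(args=sys.argv[1:], stdin=sys.stdin)
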
@@ -7,90 +7,75 @@ __description__ = 'Add a new URL or list of URLs to your archive'
import sys
|
||||
import argparse
|
||||
|
||||
from typing import List, Optional
|
||||
from typing import List, Optional, IO
|
||||
|
||||
from ..legacy.config import stderr, check_dependencies, check_data_folder
|
||||
from ..legacy.util import (
|
||||
handle_stdin_import,
|
||||
handle_file_import,
|
||||
)
|
||||
from ..legacy.main import update_archive_data
|
||||
from ..main import add
|
||||
from ..util import SmartFormatter, accept_stdin
|
||||
from ..config import OUTPUT_DIR, ONLY_NEW
|
||||
|
||||
|
||||
def main(args: List[str]=None, stdin: Optional[str]=None) -> None:
|
||||
check_data_folder()
|
||||
|
||||
args = sys.argv[1:] if args is None else args
|
||||
|
||||
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
prog=__command__,
|
||||
description=__description__,
|
||||
add_help=True,
|
||||
formatter_class=SmartFormatter,
|
||||
)
|
||||
# parser.add_argument(
|
||||
# '--depth', #'-d',
|
||||
# type=int,
|
||||
# help='Recursively archive all linked pages up to this many hops away',
|
||||
# default=0,
|
||||
# )
|
||||
parser.add_argument(
|
||||
'--only-new', #'-n',
|
||||
'--update-all', #'-n',
|
||||
action='store_true',
|
||||
help="Don't attempt to retry previously skipped/failed links when updating",
|
||||
default=not ONLY_NEW,
|
||||
help="Also retry previously skipped/failed links when adding new links",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--index-only', #'-o',
|
||||
action='store_true',
|
||||
help="Add the links to the main index without archiving them",
|
||||
)
|
||||
# parser.add_argument(
|
||||
# '--mirror', #'-m',
|
||||
# action='store_true',
|
||||
# help='Archive an entire site (finding all linked pages below it on the same domain)',
|
||||
# )
|
||||
# parser.add_argument(
|
||||
# '--crawler', #'-r',
|
||||
# choices=('depth_first', 'breadth_first'),
|
||||
# help='Controls which crawler to use in order to find outlinks in a given page',
|
||||
# default=None,
|
||||
# )
|
||||
parser.add_argument(
|
||||
'url',
|
||||
'import_path',
|
||||
nargs='?',
|
||||
type=str,
|
||||
default=None,
|
||||
help='URL of page to archive (or path to local file)'
|
||||
help=(
|
||||
'URL or path to local file containing a list of links to import. e.g.:\n'
|
||||
' https://getpocket.com/users/USERNAME/feed/all\n'
|
||||
' https://example.com/some/rss/feed.xml\n'
|
||||
' ~/Downloads/firefox_bookmarks_export.html\n'
|
||||
' ~/Desktop/sites_list.csv\n'
|
||||
)
|
||||
)
|
||||
command = parser.parse_args(args)
|
||||
|
||||
check_dependencies()
|
||||
|
||||
### Handle ingesting urls piped in through stdin
|
||||
# (.e.g if user does cat example_urls.txt | archivebox add)
|
||||
import_path = None
|
||||
if stdin or not sys.stdin.isatty():
|
||||
stdin_raw_text = stdin or sys.stdin.read()
|
||||
if stdin_raw_text and command.url:
|
||||
stderr(
|
||||
'[X] You should pass either a path as an argument, '
|
||||
'or pass a list of links via stdin, but not both.\n'
|
||||
)
|
||||
raise SystemExit(1)
|
||||
|
||||
import_path = handle_stdin_import(stdin_raw_text)
|
||||
|
||||
### Handle ingesting url from a remote file/feed
|
||||
# (e.g. if an RSS feed URL is used as the import path)
|
||||
elif command.url:
|
||||
import_path = handle_file_import(command.url)
|
||||
|
||||
update_archive_data(
|
||||
import_path=import_path,
|
||||
resume=None,
|
||||
only_new=command.only_new,
|
||||
command = parser.parse_args(args or ())
|
||||
import_str = accept_stdin(stdin)
|
||||
add(
|
||||
import_str=import_str,
|
||||
import_path=command.import_path,
|
||||
update_all=command.update_all,
|
||||
index_only=command.index_only,
|
||||
out_dir=pwd or OUTPUT_DIR,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
||||
|
||||
|
||||
# TODO: Implement these
|
||||
#
|
||||
# parser.add_argument(
|
||||
# '--depth', #'-d',
|
||||
# type=int,
|
||||
# help='Recursively archive all linked pages up to this many hops away',
|
||||
# default=0,
|
||||
# )
|
||||
# parser.add_argument(
|
||||
# '--mirror', #'-m',
|
||||
# action='store_true',
|
||||
# help='Archive an entire site (finding all linked pages below it on the same domain)',
|
||||
# )
|
||||
# parser.add_argument(
|
||||
# '--crawler', #'-r',
|
||||
# choices=('depth_first', 'breadth_first'),
|
||||
# help='Controls which crawler to use in order to find outlinks in a given page',
|
||||
# default=None,
|
||||
# )
|
||||
|
|
|
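Several commands above switch from ad-hoc `sys.stdin` handling to the shared `accept_stdin`/`reject_stdin` helpers. Their real implementation lives in `archivebox/util.py` and is not shown in this diff; the following is only a rough sketch of the behavior the call sites appear to rely on (signatures and messages are assumptions):

import sys
from typing import IO, Optional


def reject_stdin(caller: str, stdin: Optional[IO]=None) -> None:
    """Fail loudly if a command that takes no piped input receives some anyway."""
    stdin = stdin or sys.stdin
    if stdin and not stdin.isatty() and stdin.read().strip():
        print(f'[X] The {caller} command does not accept stdin input.', file=sys.stderr)
        raise SystemExit(1)


def accept_stdin(stdin: Optional[IO]=None) -> Optional[str]:
    """Return any piped-in text as a single string, or None when stdin is a TTY."""
    stdin = stdin or sys.stdin
    if stdin and not stdin.isatty():
        text = stdin.read().strip()
        return text or None
    return None
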
@@ -7,28 +7,14 @@ __description__ = 'Get and set your ArchiveBox project configuration values'
import sys
|
||||
import argparse
|
||||
|
||||
from typing import Optional, List
|
||||
from typing import Optional, List, IO
|
||||
|
||||
from ..legacy.util import SmartFormatter
|
||||
from ..legacy.config import (
|
||||
check_data_folder,
|
||||
OUTPUT_DIR,
|
||||
load_all_config,
|
||||
write_config_file,
|
||||
CONFIG,
|
||||
CONFIG_FILE,
|
||||
USER_CONFIG,
|
||||
ConfigDict,
|
||||
stderr,
|
||||
get_real_name,
|
||||
)
|
||||
from ..main import config
|
||||
from ..util import SmartFormatter, accept_stdin
|
||||
from ..config import OUTPUT_DIR
|
||||
|
||||
|
||||
def main(args: List[str]=None, stdin: Optional[str]=None) -> None:
|
||||
check_data_folder()
|
||||
|
||||
args = sys.argv[1:] if args is None else args
|
||||
|
||||
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
prog=__command__,
|
||||
description=__description__,
|
||||
|
@@ -57,102 +43,18 @@ def main(args: List[str]=None, stdin: Optional[str]=None) -> None:
type=str,
|
||||
help='KEY or KEY=VALUE formatted config values to get or set',
|
||||
)
|
||||
command = parser.parse_args(args)
|
||||
command = parser.parse_args(args or ())
|
||||
config_options_str = accept_stdin(stdin)
|
||||
|
||||
if stdin or not sys.stdin.isatty():
|
||||
stdin_raw_text = stdin or sys.stdin.read()
|
||||
if stdin_raw_text and command.config_options:
|
||||
stderr(
|
||||
'[X] You should either pass config values as an arguments '
|
||||
'or via stdin, but not both.\n',
|
||||
color='red',
|
||||
)
|
||||
raise SystemExit(1)
|
||||
|
||||
config_options = stdin_raw_text.split('\n')
|
||||
else:
|
||||
config_options = command.config_options
|
||||
|
||||
no_args = not (command.get or command.set or command.reset or command.config_options)
|
||||
|
||||
matching_config: ConfigDict = {}
|
||||
if command.get or no_args:
|
||||
if config_options:
|
||||
config_options = [get_real_name(key) for key in config_options]
|
||||
matching_config = {key: CONFIG[key] for key in config_options if key in CONFIG}
|
||||
failed_config = [key for key in config_options if key not in CONFIG]
|
||||
if failed_config:
|
||||
stderr()
|
||||
stderr('[X] These options failed to get', color='red')
|
||||
stderr(' {}'.format('\n '.join(config_options)))
|
||||
raise SystemExit(1)
|
||||
else:
|
||||
matching_config = CONFIG
|
||||
|
||||
print(printable_config(matching_config))
|
||||
raise SystemExit(not matching_config)
|
||||
elif command.set:
|
||||
new_config = {}
|
||||
failed_options = []
|
||||
for line in config_options:
|
||||
if line.startswith('#') or not line.strip():
|
||||
continue
|
||||
if '=' not in line:
|
||||
stderr('[X] Config KEY=VALUE must have an = sign in it', color='red')
|
||||
stderr(f' {line}')
|
||||
raise SystemExit(2)
|
||||
|
||||
raw_key, val = line.split('=')
|
||||
raw_key = raw_key.upper().strip()
|
||||
key = get_real_name(raw_key)
|
||||
if key != raw_key:
|
||||
stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow')
|
||||
|
||||
if key in CONFIG:
|
||||
new_config[key] = val.strip()
|
||||
else:
|
||||
failed_options.append(line)
|
||||
|
||||
if new_config:
|
||||
before = CONFIG
|
||||
matching_config = write_config_file(new_config, out_dir=OUTPUT_DIR)
|
||||
after = load_all_config()
|
||||
print(printable_config(matching_config))
|
||||
|
||||
side_effect_changes: ConfigDict = {}
|
||||
for key, val in after.items():
|
||||
if key in USER_CONFIG and (before[key] != after[key]) and (key not in matching_config):
|
||||
side_effect_changes[key] = after[key]
|
||||
|
||||
if side_effect_changes:
|
||||
stderr()
|
||||
stderr('[i] Note: This change also affected these other options that depended on it:', color='lightyellow')
|
||||
print(' {}'.format(printable_config(side_effect_changes, prefix=' ')))
|
||||
if failed_options:
|
||||
stderr()
|
||||
stderr('[X] These options failed to set:', color='red')
|
||||
stderr(' {}'.format('\n '.join(failed_options)))
|
||||
raise SystemExit(bool(failed_options))
|
||||
elif command.reset:
|
||||
stderr('[X] This command is not implemented yet.', color='red')
|
||||
stderr(' Please manually remove the relevant lines from your config file:')
|
||||
stderr(f' {CONFIG_FILE}')
|
||||
raise SystemExit(2)
|
||||
|
||||
else:
|
||||
stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
|
||||
stderr(' archivebox config')
|
||||
stderr(' archivebox config --get SOME_KEY')
|
||||
stderr(' archivebox config --set SOME_KEY=SOME_VALUE')
|
||||
raise SystemExit(2)
|
||||
|
||||
|
||||
def printable_config(config: ConfigDict, prefix: str='') -> str:
|
||||
return f'\n{prefix}'.join(
|
||||
f'{key}={val}'
|
||||
for key, val in config.items()
|
||||
if not (isinstance(val, dict) or callable(val))
|
||||
config(
|
||||
config_options_str=config_options_str,
|
||||
config_options=command.config_options,
|
||||
get=command.get,
|
||||
set=command.set,
|
||||
reset=command.reset,
|
||||
out_dir=pwd or OUTPUT_DIR,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
||||
|
|
|
@@ -7,52 +7,24 @@ __description__ = 'Print the ArchiveBox help message and usage'
import sys
|
||||
import argparse
|
||||
|
||||
from ..legacy.util import reject_stdin
|
||||
from ..legacy.config import ANSI
|
||||
from . import list_subcommands
|
||||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import help
|
||||
from ..util import reject_stdin
|
||||
from ..config import OUTPUT_DIR
|
||||
|
||||
|
||||
def main(args=None):
|
||||
args = sys.argv[1:] if args is None else args
|
||||
|
||||
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
prog=__command__,
|
||||
description=__description__,
|
||||
add_help=True,
|
||||
)
|
||||
parser.parse_args(args)
|
||||
reject_stdin(__command__)
|
||||
parser.parse_args(args or ())
|
||||
reject_stdin(__command__, stdin)
|
||||
|
||||
|
||||
COMMANDS_HELP_TEXT = '\n '.join(
|
||||
f'{cmd.ljust(20)} {summary}'
|
||||
for cmd, summary in list_subcommands().items()
|
||||
)
|
||||
|
||||
print('''{green}ArchiveBox: The self-hosted internet archive.{reset}
|
||||
|
||||
{lightblue}Usage:{reset}
|
||||
archivebox [command] [--help] [--version] [...args]
|
||||
|
||||
{lightblue}Commands:{reset}
|
||||
{}
|
||||
|
||||
{lightblue}Example Use:{reset}
|
||||
mkdir my-archive; cd my-archive/
|
||||
archivebox init
|
||||
archivebox info
|
||||
|
||||
archivebox add https://example.com/some/page
|
||||
archivebox add --depth=1 ~/Downloads/bookmarks_export.html
|
||||
|
||||
archivebox list --sort=timestamp --csv=timestamp,url,is_archived
|
||||
archivebox schedule --every=week https://example.com/some/feed.rss
|
||||
archivebox update --resume=15109948213.123
|
||||
|
||||
{lightblue}Documentation:{reset}
|
||||
https://github.com/pirate/ArchiveBox/wiki
|
||||
'''.format(COMMANDS_HELP_TEXT, **ANSI))
|
||||
help(out_dir=pwd or OUTPUT_DIR)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
||||
|
|
|
@@ -7,25 +7,24 @@ __description__ = 'Print out some info and statistics about the archive collecti
import sys
|
||||
import argparse
|
||||
|
||||
from ..legacy.config import check_data_folder
|
||||
from ..legacy.util import reject_stdin
|
||||
from ..legacy.main import info
|
||||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import info
|
||||
from ..config import OUTPUT_DIR
|
||||
from ..util import reject_stdin
|
||||
|
||||
|
||||
def main(args=None):
|
||||
check_data_folder()
|
||||
|
||||
args = sys.argv[1:] if args is None else args
|
||||
|
||||
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
prog=__command__,
|
||||
description=__description__,
|
||||
add_help=True,
|
||||
)
|
||||
parser.parse_args(args)
|
||||
reject_stdin(__command__)
|
||||
parser.parse_args(args or ())
|
||||
reject_stdin(__command__, stdin)
|
||||
|
||||
info(out_dir=pwd or OUTPUT_DIR)
|
||||
|
||||
info()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
||||
|
|
|
@@ -7,23 +7,24 @@ __description__ = 'Initialize a new ArchiveBox collection in the current directo
import sys
|
||||
import argparse
|
||||
|
||||
from ..legacy.util import reject_stdin
|
||||
from ..legacy.main import init
|
||||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import init
|
||||
from ..util import reject_stdin
|
||||
from ..config import OUTPUT_DIR
|
||||
|
||||
|
||||
def main(args=None):
|
||||
args = sys.argv[1:] if args is None else args
|
||||
|
||||
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
prog=__command__,
|
||||
description=__description__,
|
||||
add_help=True,
|
||||
)
|
||||
parser.parse_args(args)
|
||||
reject_stdin(__command__)
|
||||
parser.parse_args(args or ())
|
||||
reject_stdin(__command__, stdin)
|
||||
|
||||
init()
|
||||
init(out_dir=pwd or OUTPUT_DIR)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
||||
|
|
|
@@ -2,15 +2,17 @@
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox list'
|
||||
__description__ = 'List all the URLs currently in the archive.'
|
||||
__description__ = 'List, filter, and export information about archive entries'
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from ..legacy.util import SmartFormatter, reject_stdin, to_json, to_csv
|
||||
from ..legacy.config import check_data_folder, OUTPUT_DIR
|
||||
from ..legacy.main import (
|
||||
list_archive_data,
|
||||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import list_all
|
||||
from ..util import SmartFormatter, accept_stdin
|
||||
from ..config import OUTPUT_DIR
|
||||
from ..index import (
|
||||
get_indexed_folders,
|
||||
get_archived_folders,
|
||||
get_unarchived_folders,
|
||||
|
@@ -23,11 +25,7 @@ from ..legacy.main import (
get_unrecognized_folders,
|
||||
)
|
||||
|
||||
def main(args=None):
|
||||
check_data_folder()
|
||||
|
||||
args = sys.argv[1:] if args is None else args
|
||||
|
||||
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
prog=__command__,
|
||||
description=__description__,
|
||||
|
@@ -93,57 +91,27 @@ def main(args=None):
help='Type of pattern matching to use when filtering URLs',
|
||||
)
|
||||
parser.add_argument(
|
||||
'patterns',
|
||||
'filter_patterns',
|
||||
nargs='*',
|
||||
type=str,
|
||||
default=None,
|
||||
help='List only URLs matching these filter patterns.'
|
||||
)
|
||||
command = parser.parse_args(args)
|
||||
reject_stdin(__command__)
|
||||
command = parser.parse_args(args or ())
|
||||
filter_patterns_str = accept_stdin(stdin)
|
||||
|
||||
links = list_archive_data(
|
||||
filter_patterns=command.patterns,
|
||||
list_all(
|
||||
filter_patterns_str=filter_patterns_str,
|
||||
filter_patterns=command.filter_patterns,
|
||||
filter_type=command.filter_type,
|
||||
before=command.before,
|
||||
status=command.status,
|
||||
after=command.after,
|
||||
before=command.before,
|
||||
sort=command.sort,
|
||||
csv=command.csv,
|
||||
json=command.json,
|
||||
out_dir=pwd or OUTPUT_DIR,
|
||||
)
|
||||
|
||||
if command.sort:
|
||||
links = sorted(links, key=lambda link: getattr(link, command.sort))
|
||||
|
||||
links = list(links)
|
||||
|
||||
if command.status == 'indexed':
|
||||
folders = get_indexed_folders(links, out_dir=OUTPUT_DIR)
|
||||
elif command.status == 'archived':
|
||||
folders = get_archived_folders(links, out_dir=OUTPUT_DIR)
|
||||
elif command.status == 'unarchived':
|
||||
folders = get_unarchived_folders(links, out_dir=OUTPUT_DIR)
|
||||
|
||||
elif command.status == 'present':
|
||||
folders = get_present_folders(links, out_dir=OUTPUT_DIR)
|
||||
elif command.status == 'valid':
|
||||
folders = get_valid_folders(links, out_dir=OUTPUT_DIR)
|
||||
elif command.status == 'invalid':
|
||||
folders = get_invalid_folders(links, out_dir=OUTPUT_DIR)
|
||||
|
||||
elif command.status == 'duplicate':
|
||||
folders = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
|
||||
elif command.status == 'orphaned':
|
||||
folders = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
|
||||
elif command.status == 'corrupted':
|
||||
folders = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
|
||||
elif command.status == 'unrecognized':
|
||||
folders = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
|
||||
|
||||
if command.csv:
|
||||
print(to_csv(folders.values(), csv_cols=command.csv.split(','), header=True))
|
||||
elif command.json:
|
||||
print(to_json(folders.values(), indent=4, sort_keys=True))
|
||||
else:
|
||||
print('\n'.join(f'{folder} {link}' for folder, link in folders.items()))
|
||||
raise SystemExit(not folders)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
||||
|
|
|
@@ -6,24 +6,18 @@ __description__ = 'Run an ArchiveBox Django management command'
|
||||
import sys
|
||||
|
||||
from ..legacy.config import OUTPUT_DIR, setup_django, check_data_folder
|
||||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import manage
|
||||
from ..config import OUTPUT_DIR
|
||||
|
||||
|
||||
def main(args=None):
|
||||
check_data_folder()
|
||||
|
||||
setup_django(OUTPUT_DIR)
|
||||
from django.core.management import execute_from_command_line
|
||||
|
||||
args = sys.argv if args is None else ['archivebox', *args]
|
||||
|
||||
args[0] = f'{sys.argv[0]} manage'
|
||||
|
||||
if args[1:] == []:
|
||||
args.append('help')
|
||||
|
||||
execute_from_command_line(args)
|
||||
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
|
||||
manage(
|
||||
args=args,
|
||||
out_dir=pwd or OUTPUT_DIR,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
||||
|
|
|
@@ -7,17 +7,14 @@ __description__ = 'Remove the specified URLs from the archive.'
import sys
|
||||
import argparse
|
||||
|
||||
from typing import Optional, List, IO
|
||||
|
||||
from ..legacy.config import check_data_folder
|
||||
from ..legacy.util import reject_stdin
|
||||
from ..legacy.main import remove_archive_links
|
||||
from ..main import remove
|
||||
from ..util import accept_stdin
|
||||
from ..config import OUTPUT_DIR
|
||||
|
||||
|
||||
def main(args=None):
|
||||
check_data_folder()
|
||||
|
||||
args = sys.argv[1:] if args is None else args
|
||||
|
||||
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
prog=__command__,
|
||||
description=__description__,
|
||||
|
@@ -56,33 +53,25 @@ def main(args=None):
help='Type of pattern matching to use when filtering URLs',
|
||||
)
|
||||
parser.add_argument(
|
||||
'pattern',
|
||||
'filter_patterns',
|
||||
nargs='*',
|
||||
type=str,
|
||||
default=None,
|
||||
help='URLs matching this filter pattern will be removed from the index.'
|
||||
)
|
||||
command = parser.parse_args(args)
|
||||
command = parser.parse_args(args or ())
|
||||
filter_str = accept_stdin(stdin)
|
||||
|
||||
if not sys.stdin.isatty():
|
||||
stdin_raw_text = sys.stdin.read()
|
||||
if stdin_raw_text and command.url:
|
||||
print(
|
||||
'[X] You should pass either a pattern as an argument, '
|
||||
'or pass a list of patterns via stdin, but not both.\n'
|
||||
)
|
||||
raise SystemExit(1)
|
||||
|
||||
patterns = [pattern.strip() for pattern in stdin_raw_text.split('\n')]
|
||||
else:
|
||||
patterns = command.pattern
|
||||
|
||||
remove_archive_links(
|
||||
filter_patterns=patterns, filter_type=command.filter_type,
|
||||
before=command.before, after=command.after,
|
||||
yes=command.yes, delete=command.delete,
|
||||
remove(
|
||||
filter_str=filter_str,
|
||||
filter_patterns=command.filter_patterns,
|
||||
filter_type=command.filter_type,
|
||||
before=command.before,
|
||||
after=command.after,
|
||||
yes=command.yes,
|
||||
delete=command.delete,
|
||||
out_dir=pwd or OUTPUT_DIR,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
||||
|
|
|
@@ -4,34 +4,17 @@ __package__ = 'archivebox.cli'
__command__ = 'archivebox schedule'
|
||||
__description__ = 'Set ArchiveBox to regularly import URLs at specific times using cron'
|
||||
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from datetime import datetime
|
||||
from crontab import CronTab, CronSlices
|
||||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import schedule
|
||||
from ..util import reject_stdin
|
||||
from ..config import OUTPUT_DIR
|
||||
|
||||
|
||||
from ..legacy.util import reject_stdin
|
||||
from ..legacy.config import (
|
||||
OUTPUT_DIR,
|
||||
LOGS_DIR,
|
||||
ARCHIVEBOX_BINARY,
|
||||
USER,
|
||||
ANSI,
|
||||
stderr,
|
||||
check_data_folder,
|
||||
)
|
||||
|
||||
|
||||
CRON_COMMENT = 'archivebox_schedule'
|
||||
|
||||
|
||||
def main(args=None):
|
||||
check_data_folder()
|
||||
|
||||
args = sys.argv[1:] if args is None else args
|
||||
|
||||
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
prog=__command__,
|
||||
description=__description__,
|
||||
|
@@ -57,7 +40,7 @@ def main(args=None):
group.add_argument(
|
||||
'--clear', # '-c'
|
||||
action='store_true',
|
||||
help=("Stop all ArchiveBox scheduled runs, clear it completely from cron"),
|
||||
help=("Stop all ArchiveBox scheduled runs (remove cron jobs)"),
|
||||
)
|
||||
group.add_argument(
|
||||
'--show', # '-s'
|
||||
|
@@ -67,13 +50,14 @@ def main(args=None):
group.add_argument(
|
||||
'--foreground', '-f',
|
||||
action='store_true',
|
||||
help=("Launch ArchiveBox as a long-running foreground task "
|
||||
help=("Launch ArchiveBox scheduler as a long-running foreground task "
|
||||
"instead of using cron."),
|
||||
)
|
||||
group.add_argument(
|
||||
'--run-all', # '-a',
|
||||
action='store_true',
|
||||
help='Run all the scheduled jobs once immediately, independent of their configured schedules',
|
||||
help=("Run all the scheduled jobs once immediately, independent of "
|
||||
"their configured schedules, can be used together with --foreground"),
|
||||
)
|
||||
parser.add_argument(
|
||||
'import_path',
|
||||
|
@@ -83,115 +67,21 @@ def main(args=None):
help=("Check this path and import any new links on every run "
|
||||
"(can be either local file or remote URL)"),
|
||||
)
|
||||
command = parser.parse_args(args)
|
||||
reject_stdin(__command__)
|
||||
command = parser.parse_args(args or ())
|
||||
reject_stdin(__command__, stdin)
|
||||
|
||||
os.makedirs(LOGS_DIR, exist_ok=True)
|
||||
|
||||
cron = CronTab(user=True)
|
||||
cron = dedupe_jobs(cron)
|
||||
|
||||
existing_jobs = list(cron.find_comment(CRON_COMMENT))
|
||||
if command.foreground or command.run_all:
|
||||
if command.import_path or (not existing_jobs):
|
||||
stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI))
|
||||
stderr(' archivebox schedule --every=hour https://example.com/some/rss/feed.xml')
|
||||
raise SystemExit(1)
|
||||
print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI))
|
||||
if command.run_all:
|
||||
try:
|
||||
for job in existing_jobs:
|
||||
sys.stdout.write(f' > {job.command}')
|
||||
sys.stdout.flush()
|
||||
job.run()
|
||||
sys.stdout.write(f'\r √ {job.command}\n')
|
||||
except KeyboardInterrupt:
|
||||
print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
|
||||
raise SystemExit(1)
|
||||
if command.foreground:
|
||||
try:
|
||||
for result in cron.run_scheduler():
|
||||
print(result)
|
||||
except KeyboardInterrupt:
|
||||
print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
|
||||
raise SystemExit(1)
|
||||
|
||||
elif command.show:
|
||||
if existing_jobs:
|
||||
print('\n'.join(str(cmd) for cmd in existing_jobs))
|
||||
else:
|
||||
stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI))
|
||||
stderr(' To schedule a new job, run:')
|
||||
stderr(' archivebox schedule --every=[timeperiod] https://example.com/some/rss/feed.xml')
|
||||
raise SystemExit(0)
|
||||
|
||||
elif command.clear:
|
||||
print(cron.remove_all(comment=CRON_COMMENT))
|
||||
cron.write()
|
||||
raise SystemExit(0)
|
||||
|
||||
elif command.every:
|
||||
quoted = lambda s: f'"{s}"' if s and ' ' in s else s
|
||||
cmd = [
|
||||
'cd',
|
||||
quoted(OUTPUT_DIR),
|
||||
'&&',
|
||||
quoted(ARCHIVEBOX_BINARY),
|
||||
*(('add', f'"{command.import_path}"',) if command.import_path else ('update',)),
|
||||
'2>&1',
|
||||
'>',
|
||||
quoted(os.path.join(LOGS_DIR, 'archivebox.log')),
|
||||
|
||||
]
|
||||
new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)
|
||||
|
||||
if command.every in ('minute', 'hour', 'day', 'week', 'month', 'year'):
|
||||
set_every = getattr(new_job.every(), command.every)
|
||||
set_every()
|
||||
elif CronSlices.is_valid(command.every):
|
||||
new_job.setall(command.every)
|
||||
else:
|
||||
stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI))
|
||||
stderr(' It must be one of minute/hour/day/week/month')
|
||||
stderr(' or a quoted cron-format schedule like:')
|
||||
stderr(' archivebox init --every=day https://example.com/some/rss/feed.xml')
|
||||
stderr(' archivebox init --every="0/5 * * * *" https://example.com/some/rss/feed.xml')
|
||||
raise SystemExit(1)
|
||||
|
||||
cron = dedupe_jobs(cron)
|
||||
cron.write()
|
||||
|
||||
total_runs = sum(j.frequency_per_year() for j in cron)
|
||||
existing_jobs = list(cron.find_comment(CRON_COMMENT))
|
||||
|
||||
print()
|
||||
print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI))
|
||||
print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs))
|
||||
if total_runs > 60 and not command.quiet:
|
||||
stderr()
|
||||
stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI))
|
||||
stderr(f' Congrats on being an enthusiastic internet archiver! 👌')
|
||||
stderr()
|
||||
stderr(' Make sure you have enough storage space available to hold all the data.')
|
||||
stderr(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
|
||||
raise SystemExit(0)
|
||||
|
||||
|
||||
def dedupe_jobs(cron: CronTab) -> CronTab:
|
||||
deduped = set()
|
||||
for job in list(cron):
|
||||
unique_tuple = (str(job.slices), job.command)
|
||||
if unique_tuple not in deduped:
|
||||
deduped.add(unique_tuple)
|
||||
cron.remove(job)
|
||||
|
||||
for schedule, command in deduped:
|
||||
job = cron.new(command=command, comment=CRON_COMMENT)
|
||||
job.setall(schedule)
|
||||
job.enable()
|
||||
|
||||
return cron
|
||||
schedule(
|
||||
add=command.add,
|
||||
show=command.show,
|
||||
clear=command.clear,
|
||||
foreground=command.foreground,
|
||||
run_all=command.run_all,
|
||||
quiet=command.quiet,
|
||||
every=command.every,
|
||||
import_path=command.import_path,
|
||||
out_dir=pwd or OUTPUT_DIR,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
||||
|
|
|
@@ -7,15 +7,14 @@ __description__ = 'Run the ArchiveBox HTTP server'
import sys
|
||||
import argparse
|
||||
|
||||
from ..legacy.config import setup_django, IS_TTY, OUTPUT_DIR, ANSI, check_data_folder
|
||||
from ..legacy.util import reject_stdin
|
||||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import server
|
||||
from ..util import reject_stdin
|
||||
from ..config import OUTPUT_DIR
|
||||
|
||||
|
||||
def main(args=None):
|
||||
check_data_folder()
|
||||
|
||||
args = sys.argv[1:] if args is None else args
|
||||
|
||||
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
prog=__command__,
|
||||
description=__description__,
|
||||
|
@@ -33,26 +32,15 @@ def main(args=None):
action='store_true',
|
||||
help='Enable auto-reloading when code or templates change',
|
||||
)
|
||||
command = parser.parse_args(args)
|
||||
reject_stdin(__command__)
|
||||
command = parser.parse_args(args or ())
|
||||
reject_stdin(__command__, stdin)
|
||||
|
||||
setup_django(OUTPUT_DIR)
|
||||
from django.core.management import call_command
|
||||
from django.contrib.auth.models import User
|
||||
|
||||
if IS_TTY and not User.objects.filter(is_superuser=True).exists():
|
||||
print('{lightyellow}[!] No admin users exist yet, you will not be able to edit links in the UI.{reset}'.format(**ANSI))
|
||||
print()
|
||||
print(' To create an admin user, run:')
|
||||
print(' archivebox manage createsuperuser')
|
||||
print()
|
||||
|
||||
print('{green}[+] Starting ArchiveBox webserver...{reset}'.format(**ANSI))
|
||||
if not command.reload:
|
||||
command.runserver_args.append('--noreload')
|
||||
|
||||
call_command("runserver", *command.runserver_args)
|
||||
server(
|
||||
runserver_args=command.runserver_args,
|
||||
reload=command.reload,
|
||||
out_dir=pwd or OUTPUT_DIR,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
||||
|
|
|
@@ -7,27 +7,26 @@ __description__ = 'Enter an interactive ArchiveBox Django shell'
import sys
|
||||
import argparse
|
||||
|
||||
from ..legacy.config import setup_django, OUTPUT_DIR, check_data_folder
|
||||
from ..legacy.util import reject_stdin
|
||||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import shell
|
||||
from ..config import OUTPUT_DIR
|
||||
from ..util import reject_stdin
|
||||
|
||||
|
||||
def main(args=None):
|
||||
check_data_folder()
|
||||
|
||||
args = sys.argv[1:] if args is None else args
|
||||
|
||||
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
prog=__command__,
|
||||
description=__description__,
|
||||
add_help=True,
|
||||
)
|
||||
parser.parse_args(args)
|
||||
reject_stdin(__command__)
|
||||
parser.parse_args(args or ())
|
||||
reject_stdin(__command__, stdin)
|
||||
|
||||
setup_django(OUTPUT_DIR)
|
||||
from django.core.management import call_command
|
||||
call_command("shell_plus")
|
||||
shell(
|
||||
out_dir=pwd or OUTPUT_DIR,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
||||
|
|
|
@@ -2,27 +2,36 @@
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox update'
|
||||
__description__ = 'Import any new links from subscriptions and retry any previously failed/skipped links.'
|
||||
__description__ = 'Import any new links from subscriptions and retry any previously failed/skipped links'
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from typing import List
|
||||
from typing import List, Optional, IO
|
||||
|
||||
from ..legacy.config import check_data_folder
|
||||
from ..legacy.util import reject_stdin
|
||||
from ..legacy.main import update_archive_data
|
||||
from ..main import update
|
||||
from ..util import SmartFormatter, accept_stdin
|
||||
from ..config import OUTPUT_DIR
|
||||
from ..index import (
|
||||
get_indexed_folders,
|
||||
get_archived_folders,
|
||||
get_unarchived_folders,
|
||||
get_present_folders,
|
||||
get_valid_folders,
|
||||
get_invalid_folders,
|
||||
get_duplicate_folders,
|
||||
get_orphaned_folders,
|
||||
get_corrupted_folders,
|
||||
get_unrecognized_folders,
|
||||
)
|
||||
|
||||
|
||||
def main(args: List[str]=None):
|
||||
check_data_folder()
|
||||
|
||||
args = sys.argv[1:] if args is None else args
|
||||
|
||||
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
prog=__command__,
|
||||
description=__description__,
|
||||
add_help=True,
|
||||
formatter_class=SmartFormatter,
|
||||
)
|
||||
parser.add_argument(
|
||||
'--only-new', #'-n',
|
||||
|
@@ -40,16 +49,75 @@ def main(args: List[str]=None):
help='Resume the update process from a given timestamp',
|
||||
default=None,
|
||||
)
|
||||
parser.add_argument(
|
||||
'--overwrite', #'-x',
|
||||
action='store_true',
|
||||
help='Ignore existing archived content and overwrite with new versions (DANGEROUS)',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--before', #'-b',
|
||||
type=float,
|
||||
help="Update only links bookmarked before the given timestamp.",
|
||||
default=None,
|
||||
)
|
||||
parser.add_argument(
|
||||
'--after', #'-a',
|
||||
type=float,
|
||||
help="Update only links bookmarked after the given timestamp.",
|
||||
default=None,
|
||||
)
|
||||
parser.add_argument(
|
||||
'--status',
|
||||
type=str,
|
||||
choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'),
|
||||
default='indexed',
|
||||
help=(
|
||||
'Update only links or data directories that have the given status\n'
|
||||
f' indexed {get_indexed_folders.__doc__} (the default)\n'
|
||||
f' archived {get_archived_folders.__doc__}\n'
|
||||
f' unarchived {get_unarchived_folders.__doc__}\n'
|
||||
'\n'
|
||||
f' present {get_present_folders.__doc__}\n'
|
||||
f' valid {get_valid_folders.__doc__}\n'
|
||||
f' invalid {get_invalid_folders.__doc__}\n'
|
||||
'\n'
|
||||
f' duplicate {get_duplicate_folders.__doc__}\n'
|
||||
f' orphaned {get_orphaned_folders.__doc__}\n'
|
||||
f' corrupted {get_corrupted_folders.__doc__}\n'
|
||||
f' unrecognized {get_unrecognized_folders.__doc__}\n'
|
||||
)
|
||||
)
|
||||
parser.add_argument(
|
||||
'--filter-type',
|
||||
type=str,
|
||||
choices=('exact', 'substring', 'domain', 'regex'),
|
||||
default='exact',
|
||||
help='Type of pattern matching to use when filtering URLs',
|
||||
)
|
||||
parser.add_argument(
|
||||
'filter_patterns',
|
||||
nargs='*',
|
||||
type=str,
|
||||
default=None,
|
||||
help='List only URLs matching these filter patterns.'
|
||||
)
|
||||
command = parser.parse_args(args)
|
||||
reject_stdin(__command__)
|
||||
filter_patterns_str = accept_stdin(stdin)
|
||||
|
||||
update_archive_data(
|
||||
import_path=None,
|
||||
update(
|
||||
resume=command.resume,
|
||||
only_new=command.only_new,
|
||||
index_only=command.index_only,
|
||||
overwrite=command.overwrite,
|
||||
filter_patterns_str=filter_patterns_str,
|
||||
filter_patterns=command.filter_patterns,
|
||||
filter_type=command.filter_type,
|
||||
status=command.status,
|
||||
after=command.after,
|
||||
before=command.before,
|
||||
out_dir=pwd or OUTPUT_DIR,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
||||
|
|
|
@@ -4,26 +4,17 @@ __package__ = 'archivebox.cli'
__command__ = 'archivebox version'
|
||||
__description__ = 'Print the ArchiveBox version and dependency information'
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from ..legacy.util import reject_stdin, human_readable_size
|
||||
from ..legacy.config import (
|
||||
ANSI,
|
||||
VERSION,
|
||||
CODE_LOCATIONS,
|
||||
CONFIG_LOCATIONS,
|
||||
DATA_LOCATIONS,
|
||||
DEPENDENCIES,
|
||||
check_dependencies,
|
||||
)
|
||||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import version
|
||||
from ..util import reject_stdin
|
||||
from ..config import OUTPUT_DIR
|
||||
|
||||
|
||||
def main(args=None):
|
||||
args = sys.argv[1:] if args is None else args
|
||||
|
||||
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
prog=__command__,
|
||||
description=__description__,
|
||||
|
@@ -34,92 +25,14 @@ def main(args=None):
action='store_true',
|
||||
help='Only print ArchiveBox version number and nothing else.',
|
||||
)
|
||||
command = parser.parse_args(args)
|
||||
reject_stdin(__command__)
|
||||
command = parser.parse_args(args or ())
|
||||
reject_stdin(__command__, stdin)
|
||||
|
||||
if command.quiet:
|
||||
print(VERSION)
|
||||
else:
|
||||
print('ArchiveBox v{}'.format(VERSION))
|
||||
print()
|
||||
|
||||
print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
|
||||
for name, dependency in DEPENDENCIES.items():
|
||||
print_dependency_version(name, dependency)
|
||||
|
||||
print()
|
||||
print('{white}[i] Code locations:{reset}'.format(**ANSI))
|
||||
for name, folder in CODE_LOCATIONS.items():
|
||||
print_folder_status(name, folder)
|
||||
|
||||
print()
|
||||
print('{white}[i] Config locations:{reset}'.format(**ANSI))
|
||||
for name, folder in CONFIG_LOCATIONS.items():
|
||||
print_folder_status(name, folder)
|
||||
|
||||
print()
|
||||
print('{white}[i] Data locations:{reset}'.format(**ANSI))
|
||||
for name, folder in DATA_LOCATIONS.items():
|
||||
print_folder_status(name, folder)
|
||||
|
||||
print()
|
||||
check_dependencies()
|
||||
|
||||
|
||||
def print_folder_status(name, folder):
|
||||
if folder['enabled']:
|
||||
if folder['is_valid']:
|
||||
color, symbol, note = 'green', '√', 'valid'
|
||||
else:
|
||||
color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
|
||||
else:
|
||||
color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'
|
||||
|
||||
if folder['path']:
|
||||
if os.path.exists(folder['path']):
|
||||
num_files = (
|
||||
f'{len(os.listdir(folder["path"]))} files'
|
||||
if os.path.isdir(folder['path']) else
|
||||
human_readable_size(os.path.getsize(folder['path']))
|
||||
)
|
||||
else:
|
||||
num_files = 'missing'
|
||||
|
||||
print(
|
||||
ANSI[color],
|
||||
symbol,
|
||||
ANSI['reset'],
|
||||
name.ljust(24),
|
||||
(folder["path"] or '').ljust(70),
|
||||
num_files.ljust(14),
|
||||
ANSI[color],
|
||||
note,
|
||||
ANSI['reset'],
|
||||
)
|
||||
|
||||
|
||||
def print_dependency_version(name, dependency):
|
||||
if dependency['enabled']:
|
||||
if dependency['is_valid']:
|
||||
color, symbol, note = 'green', '√', 'valid'
|
||||
version = 'v' + re.search(r'[\d\.]+', dependency['version'])[0]
|
||||
else:
|
||||
color, symbol, note, version = 'red', 'X', 'invalid', '?'
|
||||
else:
|
||||
color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
|
||||
|
||||
print(
|
||||
ANSI[color],
|
||||
symbol,
|
||||
ANSI['reset'],
|
||||
name.ljust(24),
|
||||
(dependency["path"] or '').ljust(70),
|
||||
version.ljust(14),
|
||||
ANSI[color],
|
||||
note,
|
||||
ANSI['reset'],
|
||||
version(
|
||||
quiet=command.quiet,
|
||||
out_dir=pwd or OUTPUT_DIR,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
||||
|
|
|
@@ -1,3 +1,5 @@
__package__ = 'archivebox.cli'
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
@@ -5,8 +7,8 @@ from datetime import datetime
from dataclasses import dataclass
|
||||
from typing import Optional, List
|
||||
|
||||
from .schema import Link, ArchiveResult
|
||||
from .config import ANSI, OUTPUT_DIR, IS_TTY
|
||||
from ..index.schema import Link, ArchiveResult
|
||||
from ..config import ANSI, OUTPUT_DIR, IS_TTY
|
||||
|
||||
|
||||
@dataclass
|
||||
|
@@ -80,7 +82,7 @@ def log_indexing_finished(out_path: str):
|
||||
### Archiving Stage
|
||||
|
||||
def log_archiving_started(num_links: int, resume: Optional[float]):
|
||||
def log_archiving_started(num_links: int, resume: Optional[float]=None):
|
||||
start_ts = datetime.now()
|
||||
_LAST_RUN_STATS.archiving_start_ts = start_ts
|
||||
print()
|
||||
|
@@ -92,7 +94,7 @@ def log_archiving_started(num_links: int, resume: Optional[float]):
**ANSI,
|
||||
))
|
||||
else:
|
||||
print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
|
||||
print('{green}[▶] [{}] Updating content for {} matching pages in archive...{reset}'.format(
|
||||
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
num_links,
|
||||
**ANSI,
|
||||
|
@@ -213,18 +215,18 @@ def log_archive_method_finished(result: ArchiveResult):
print()
|
||||
|
||||
|
||||
def log_list_started(filter_patterns: List[str], filter_type: str):
|
||||
def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
|
||||
print('{green}[*] Finding links in the archive index matching these {} patterns:{reset}'.format(
|
||||
filter_type,
|
||||
**ANSI,
|
||||
))
|
||||
print(' {}'.format(' '.join(filter_patterns)))
|
||||
print(' {}'.format(' '.join(filter_patterns or ())))
|
||||
|
||||
def log_list_finished(links):
|
||||
from .util import to_csv
|
||||
from ..util import links_to_csv
|
||||
print()
|
||||
print('---------------------------------------------------------------------------------------------------')
|
||||
print(to_csv(links, csv_cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
|
||||
print(links_to_csv(links, csv_cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
|
||||
print('---------------------------------------------------------------------------------------------------')
|
||||
print()
|
||||
|
|
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox'
|
||||
__package__ = 'archivebox.cli'
|
||||
|
||||
|
||||
import os
|
||||
|
@@ -29,15 +29,15 @@ TEST_CONFIG = {
OUTPUT_DIR = 'data.tests'
|
||||
os.environ.update(TEST_CONFIG)
|
||||
|
||||
from .legacy.main import init
|
||||
from .legacy.index import load_main_index
|
||||
from .legacy.config import (
|
||||
from ..main import init
|
||||
from ..index import load_main_index
|
||||
from ..config import (
|
||||
SQL_INDEX_FILENAME,
|
||||
JSON_INDEX_FILENAME,
|
||||
HTML_INDEX_FILENAME,
|
||||
)
|
||||
|
||||
from .cli import (
|
||||
from . import (
|
||||
archivebox_init,
|
||||
archivebox_add,
|
||||
archivebox_remove,
|
|
@@ -1,4 +1,4 @@
__package__ = 'archivebox.legacy'
|
||||
__package__ = 'archivebox.config'
|
||||
|
||||
import os
|
||||
import io
|
||||
|
@@ -13,7 +13,7 @@ from typing import Optional, Type, Tuple, Dict
from subprocess import run, PIPE, DEVNULL
|
||||
from configparser import ConfigParser
|
||||
|
||||
from .config_stubs import (
|
||||
from .stubs import (
|
||||
SimpleConfigValueDict,
|
||||
ConfigValue,
|
||||
ConfigDict,
|
||||
|
@@ -40,7 +40,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
'GENERAL_CONFIG': {
|
||||
'OUTPUT_DIR': {'type': str, 'default': None},
|
||||
'CONFIG_FILE': {'type': str, 'default': None},
|
||||
'ONLY_NEW': {'type': bool, 'default': False},
|
||||
'ONLY_NEW': {'type': bool, 'default': True},
|
||||
'TIMEOUT': {'type': int, 'default': 60},
|
||||
'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
|
||||
'OUTPUT_PERMISSIONS': {'type': str, 'default': '755'},
|
||||
|
@@ -122,8 +122,7 @@ ANSI = {k: '' for k in DEFAULT_CLI_COLORS.keys()}
|
||||
VERSION_FILENAME = 'VERSION'
|
||||
PYTHON_DIR_NAME = 'archivebox'
|
||||
LEGACY_DIR_NAME = 'legacy'
|
||||
TEMPLATES_DIR_NAME = 'templates'
|
||||
TEMPLATES_DIR_NAME = 'themes'
|
||||
|
||||
ARCHIVE_DIR_NAME = 'archive'
|
||||
SOURCES_DIR_NAME = 'sources'
|
||||
|
@@ -158,8 +157,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
|
||||
'REPO_DIR': {'default': lambda c: os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..'))},
|
||||
'PYTHON_DIR': {'default': lambda c: os.path.join(c['REPO_DIR'], PYTHON_DIR_NAME)},
|
||||
'LEGACY_DIR': {'default': lambda c: os.path.join(c['PYTHON_DIR'], LEGACY_DIR_NAME)},
|
||||
'TEMPLATES_DIR': {'default': lambda c: os.path.join(c['LEGACY_DIR'], TEMPLATES_DIR_NAME)},
|
||||
'TEMPLATES_DIR': {'default': lambda c: os.path.join(c['PYTHON_DIR'], TEMPLATES_DIR_NAME, 'legacy')},
|
||||
|
||||
'OUTPUT_DIR': {'default': lambda c: os.path.abspath(os.path.expanduser(c['OUTPUT_DIR'])) if c['OUTPUT_DIR'] else os.path.abspath(os.curdir)},
|
||||
'ARCHIVE_DIR': {'default': lambda c: os.path.join(c['OUTPUT_DIR'], ARCHIVE_DIR_NAME)},
|
||||
|
@@ -210,7 +208,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
|
||||
'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
|
||||
'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
|
||||
'CONFIG_LOCATIONS': {'default': lambda c: get_config_locations(c)},
|
||||
'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)},
|
||||
'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
|
||||
'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
|
||||
}
|
||||
|
@@ -370,6 +368,7 @@ def load_config(defaults: ConfigDefaultDict,
stderr(' For config documentation and examples see:')
|
||||
stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration')
|
||||
stderr()
|
||||
raise
|
||||
raise SystemExit(2)
|
||||
|
||||
return extended_config
|
||||
|
@@ -492,18 +491,13 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
'REPO_DIR': {
|
||||
'path': os.path.abspath(config['REPO_DIR']),
|
||||
'enabled': True,
|
||||
'is_valid': os.path.exists(os.path.join(config['REPO_DIR'], '.github')),
|
||||
'is_valid': os.path.exists(os.path.join(config['REPO_DIR'], 'archivebox')),
|
||||
},
|
||||
'PYTHON_DIR': {
|
||||
'path': os.path.abspath(config['PYTHON_DIR']),
|
||||
'enabled': True,
|
||||
'is_valid': os.path.exists(os.path.join(config['PYTHON_DIR'], '__main__.py')),
|
||||
},
|
||||
'LEGACY_DIR': {
|
||||
'path': os.path.abspath(config['LEGACY_DIR']),
|
||||
'enabled': True,
|
||||
'is_valid': os.path.exists(os.path.join(config['LEGACY_DIR'], 'util.py')),
|
||||
},
|
||||
'TEMPLATES_DIR': {
|
||||
'path': os.path.abspath(config['TEMPLATES_DIR']),
|
||||
'enabled': True,
|
||||
|
@@ -511,14 +505,9 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
},
|
||||
}
|
||||
|
||||
def get_config_locations(config: ConfigDict) -> ConfigValue:
|
||||
def get_external_locations(config: ConfigDict) -> ConfigValue:
|
||||
abspath = lambda path: None if path is None else os.path.abspath(path)
|
||||
return {
|
||||
'CONFIG_FILE': {
|
||||
'path': abspath(config['CHROME_USER_DATA_DIR']),
|
||||
'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
|
||||
'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else os.path.exists(os.path.join(config['CHROME_USER_DATA_DIR'], 'Default')),
|
||||
},
|
||||
'CHROME_USER_DATA_DIR': {
|
||||
'path': abspath(config['CHROME_USER_DATA_DIR']),
|
||||
'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
|
||||
|
@@ -553,11 +542,26 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
'enabled': True,
|
||||
'is_valid': os.path.exists(config['ARCHIVE_DIR']),
|
||||
},
|
||||
'CONFIG_FILE': {
|
||||
'path': os.path.abspath(config['CONFIG_FILE']),
|
||||
'enabled': True,
|
||||
'is_valid': os.path.exists(config['CONFIG_FILE']),
|
||||
},
|
||||
'SQL_INDEX': {
|
||||
'path': os.path.abspath(os.path.join(config['OUTPUT_DIR'], SQL_INDEX_FILENAME)),
|
||||
'enabled': True,
|
||||
'is_valid': os.path.exists(os.path.join(config['OUTPUT_DIR'], SQL_INDEX_FILENAME)),
|
||||
},
|
||||
'JSON_INDEX': {
|
||||
'path': os.path.abspath(os.path.join(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)),
|
||||
'enabled': True,
|
||||
'is_valid': os.path.exists(os.path.join(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)),
|
||||
},
|
||||
'HTML_INDEX': {
|
||||
'path': os.path.abspath(os.path.join(config['OUTPUT_DIR'], HTML_INDEX_FILENAME)),
|
||||
'enabled': True,
|
||||
'is_valid': os.path.exists(os.path.join(config['OUTPUT_DIR'], HTML_INDEX_FILENAME)),
|
||||
},
|
||||
}
|
||||
|
||||
def get_dependency_info(config: ConfigDict) -> ConfigValue:
|
||||
|
@@ -731,7 +735,7 @@ def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) ->
|
||||
json_index_exists = os.path.exists(os.path.join(output_dir, JSON_INDEX_FILENAME))
|
||||
if not json_index_exists:
|
||||
stderr('[X] No archive index was found in current directory.', color='red')
|
||||
stderr('[X] No archive main index was found in current directory.', color='red')
|
||||
stderr(f' {output_dir}')
|
||||
stderr()
|
||||
stderr(' Are you running archivebox in the right folder?')
|
||||
|
@@ -743,7 +747,7 @@ def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) ->
raise SystemExit(2)
|
||||
|
||||
sql_index_exists = os.path.exists(os.path.join(output_dir, SQL_INDEX_FILENAME))
|
||||
from .storage.sql import list_migrations
|
||||
from ..index.sql import list_migrations
|
||||
|
||||
pending_migrations = [name for status, name in list_migrations() if not status]
|
||||
|
|
@@ -17,6 +17,7 @@ class ConfigDict(BaseConfig, total=False):
SHOW_PROGRESS: bool
|
||||
|
||||
OUTPUT_DIR: str
|
||||
CONFIG_FILE: str
|
||||
ONLY_NEW: bool
|
||||
TIMEOUT: int
|
||||
MEDIA_TIMEOUT: int
|
||||
|
@@ -63,7 +64,6 @@ class ConfigDict(BaseConfig, total=False):
ANSI: Dict[str, str]
|
||||
REPO_DIR: str
|
||||
PYTHON_DIR: str
|
||||
LEGACY_DIR: str
|
||||
TEMPLATES_DIR: str
|
||||
ARCHIVE_DIR: str
|
||||
SOURCES_DIR: str
|
|
@@ -1,9 +1,7 @@
|
||||
from datetime import datetime
|
||||
|
||||
from django.contrib import admin
|
||||
|
||||
from .models import Page
|
||||
from core.models import Page
|
||||
|
||||
|
||||
class PageAdmin(admin.ModelAdmin):
|
||||
list_display = ('timestamp', 'short_url', 'title', 'is_archived', 'num_outputs', 'added', 'updated', 'url_hash')
|
||||
|
|
|
@@ -4,8 +4,8 @@ import uuid
|
||||
from django.db import models
|
||||
|
||||
from legacy.schema import Link
|
||||
from legacy.util import parse_date
|
||||
from ..util import parse_date
|
||||
from ..index.schema import Link
|
||||
|
||||
|
||||
class Page(models.Model):
|
||||
|
|
|
@@ -2,8 +2,8 @@ from django.shortcuts import render
|
||||
from django.views import View
|
||||
|
||||
from legacy.config import OUTPUT_DIR
|
||||
from legacy.index import load_main_index, load_main_index_meta
|
||||
from .index import load_main_index, load_main_index_meta
|
||||
from .config import OUTPUT_DIR
|
||||
|
||||
|
||||
class MainIndex(View):
|
||||
|
@@ -34,7 +34,7 @@ class AddLinks(View):
def post(self, request):
|
||||
import_path = request.POST['url']
|
||||
|
||||
# TODO: add the links to the index here using archivebox.legacy.main.update_archive_data
|
||||
# TODO: add the links to the index here using archivebox.main.add
|
||||
print(f'Adding URL: {import_path}')
|
||||
|
||||
return render(template_name=self.template, request=request, context={})
|
||||
|
|
|
@ -1,4 +1,17 @@
|
|||
print()
|
||||
print('[i] Welcome to the ArchiveBox Shell! Example usage:')
|
||||
print(' Page.objects.all()')
|
||||
print(' User.objects.all()')
|
||||
from cli import list_subcommands
|
||||
|
||||
from .config import ANSI
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
print('{green}# ArchiveBox Imports{reset}'.format(**ANSI))
|
||||
# print('from archivebox.core.models import Page, User')
|
||||
print('{green}from archivebox.cli import\narchivebox_{}{reset}'.format("\narchivebox_".join(list_subcommands().keys()), **ANSI))
|
||||
print()
|
||||
print('[i] Welcome to the ArchiveBox Shell! Example use:')
|
||||
print(' print(Page.objects.filter(is_archived=True).count())')
|
||||
print(' Page.objects.get(url="https://example.com").as_json()')
|
||||
|
||||
print(' Page.objects.get(url="https://example.com").as_json()')
|
||||
|
||||
print(' from archivebox.main import get_invalid_folders')
|
||||
|
|
archivebox/extractors/__init__.py (new file, 105 lines)
|
@ -0,0 +1,105 @@
|
|||
__package__ = 'archivebox.extractors'
|
||||
|
||||
import os
|
||||
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..index import (
|
||||
load_link_details,
|
||||
write_link_details,
|
||||
patch_main_index,
|
||||
)
|
||||
from ..util import enforce_types
|
||||
from ..cli.logging import (
|
||||
log_link_archiving_started,
|
||||
log_link_archiving_finished,
|
||||
log_archive_method_started,
|
||||
log_archive_method_finished,
|
||||
)
|
||||
|
||||
from .title import should_save_title, save_title
|
||||
from .favicon import should_save_favicon, save_favicon
|
||||
from .wget import should_save_wget, save_wget
|
||||
from .pdf import should_save_pdf, save_pdf
|
||||
from .screenshot import should_save_screenshot, save_screenshot
|
||||
from .dom import should_save_dom, save_dom
|
||||
from .git import should_save_git, save_git
|
||||
from .media import should_save_media, save_media
|
||||
from .archive_org import should_save_archive_dot_org, save_archive_dot_org
|
||||
|
||||
|
||||
@enforce_types
|
||||
def archive_link(link: Link, overwrite: bool=False, out_dir: Optional[str]=None) -> Link:
|
||||
"""download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
|
||||
|
||||
ARCHIVE_METHODS = (
|
||||
('title', should_save_title, save_title),
|
||||
('favicon', should_save_favicon, save_favicon),
|
||||
('wget', should_save_wget, save_wget),
|
||||
('pdf', should_save_pdf, save_pdf),
|
||||
('screenshot', should_save_screenshot, save_screenshot),
|
||||
('dom', should_save_dom, save_dom),
|
||||
('git', should_save_git, save_git),
|
||||
('media', should_save_media, save_media),
|
||||
('archive_org', should_save_archive_dot_org, save_archive_dot_org),
|
||||
)
|
||||
|
||||
out_dir = out_dir or link.link_dir
|
||||
try:
|
||||
is_new = not os.path.exists(out_dir)
|
||||
if is_new:
|
||||
os.makedirs(out_dir)
|
||||
|
||||
link = load_link_details(link, out_dir=out_dir)
|
||||
log_link_archiving_started(link, out_dir, is_new)
|
||||
link = link.overwrite(updated=datetime.now())
|
||||
stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
|
||||
|
||||
for method_name, should_run, method_function in ARCHIVE_METHODS:
|
||||
try:
|
||||
if method_name not in link.history:
|
||||
link.history[method_name] = []
|
||||
|
||||
if should_run(link, out_dir) or overwrite:
|
||||
log_archive_method_started(method_name)
|
||||
|
||||
result = method_function(link=link, out_dir=out_dir)
|
||||
|
||||
link.history[method_name].append(result)
|
||||
|
||||
stats[result.status] += 1
|
||||
log_archive_method_finished(result)
|
||||
else:
|
||||
stats['skipped'] += 1
|
||||
except Exception as e:
|
||||
raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
|
||||
method_name,
|
||||
link.url,
|
||||
)) from e
|
||||
|
||||
# print(' ', stats)
|
||||
|
||||
write_link_details(link, out_dir=link.link_dir)
|
||||
patch_main_index(link)
|
||||
|
||||
# # If any changes were made, update the main links index json and html
|
||||
# was_changed = stats['succeeded'] or stats['failed']
|
||||
# if was_changed:
|
||||
# patch_main_index(link)
|
||||
|
||||
log_link_archiving_finished(link, link.link_dir, is_new, stats)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
try:
|
||||
write_link_details(link, out_dir=link.link_dir)
|
||||
except:
|
||||
pass
|
||||
raise
|
||||
|
||||
except Exception as err:
|
||||
print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
|
||||
raise
|
||||
|
||||
return link
|
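For reference, every module in archivebox/extractors exposes the same two-function interface: a should_save_<name>(link, out_dir) predicate and a save_<name>(link, out_dir, timeout) function returning an ArchiveResult, and archive_link() above simply walks the ARCHIVE_METHODS tuple calling each pair in order. A sketch of a hypothetical extra extractor following that shape (the 'headers' name and its curl --head command are invented for illustration, not part of this PR):

    __package__ = 'archivebox.extractors'

    import os
    from typing import Optional

    from ..index.schema import Link, ArchiveResult, ArchiveOutput
    from ..util import enforce_types, TimedProgress, run, PIPE
    from ..config import TIMEOUT, CURL_BINARY, CURL_VERSION


    @enforce_types
    def should_save_headers(link: Link, out_dir: Optional[str]=None) -> bool:
        out_dir = out_dir or link.link_dir
        # skip if a previous run already produced output (same convention as the other extractors)
        return not os.path.exists(os.path.join(out_dir, 'headers.txt'))

    @enforce_types
    def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
        """save the response headers with curl --head (illustrative example only)"""
        out_dir = out_dir or link.link_dir
        output: ArchiveOutput = 'headers.txt'
        cmd = [CURL_BINARY, '--silent', '--head', '--max-time', str(timeout), link.url]
        status = 'succeeded'
        timer = TimedProgress(timeout, prefix='      ')
        try:
            result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
            with open(os.path.join(out_dir, str(output)), 'w', encoding='utf-8') as f:
                f.write(result.stdout.decode())
        except Exception as err:
            status = 'failed'
            output = err
        finally:
            timer.end()
        return ArchiveResult(cmd=cmd, pwd=out_dir, cmd_version=CURL_VERSION,
                             output=output, status=status, **timer.stats)

Wiring it in would just mean adding a ('headers', should_save_headers, save_headers) entry to ARCHIVE_METHODS above.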
archivebox/extractors/archive_org.py (new file, 115 lines)
|
@ -0,0 +1,115 @@
|
|||
__package__ = 'archivebox.extractors'
|
||||
|
||||
import os
|
||||
|
||||
from typing import Optional, List, Dict, Tuple
|
||||
from collections import defaultdict
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
TimedProgress,
|
||||
run,
|
||||
PIPE,
|
||||
DEVNULL,
|
||||
is_static_file,
|
||||
ArchiveError,
|
||||
chmod_file,
|
||||
)
|
||||
from ..config import (
|
||||
VERSION,
|
||||
TIMEOUT,
|
||||
SAVE_ARCHIVE_DOT_ORG,
|
||||
CURL_BINARY,
|
||||
CURL_VERSION,
|
||||
CHECK_SSL_VALIDITY
|
||||
)
|
||||
|
||||
|
||||
|
||||
@enforce_types
|
||||
def should_save_archive_dot_org(link: Link, out_dir: Optional[str]=None) -> bool:
|
||||
out_dir = out_dir or link.link_dir
|
||||
if is_static_file(link.url):
|
||||
return False
|
||||
|
||||
if os.path.exists(os.path.join(out_dir, 'archive.org.txt')):
|
||||
# if open(path, 'r').read().strip() != 'None':
|
||||
return False
|
||||
|
||||
return SAVE_ARCHIVE_DOT_ORG
|
||||
|
||||
@enforce_types
|
||||
def save_archive_dot_org(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
||||
"""submit site to archive.org for archiving via their service, save returned archive url"""
|
||||
|
||||
out_dir = out_dir or link.link_dir
|
||||
output: ArchiveOutput = 'archive.org.txt'
|
||||
archive_org_url = None
|
||||
submit_url = 'https://web.archive.org/save/{}'.format(link.url)
|
||||
cmd = [
|
||||
CURL_BINARY,
|
||||
'--location',
|
||||
'--head',
|
||||
'--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
|
||||
'--max-time', str(timeout),
|
||||
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
||||
submit_url,
|
||||
]
|
||||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout)
|
||||
content_location, errors = parse_archive_dot_org_response(result.stdout)
|
||||
if content_location:
|
||||
archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
|
||||
elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
|
||||
archive_org_url = None
|
||||
# raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))
|
||||
elif errors:
|
||||
raise ArchiveError(', '.join(errors))
|
||||
else:
|
||||
raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
|
||||
except Exception as err:
|
||||
status = 'failed'
|
||||
output = err
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
if output and not isinstance(output, Exception):
|
||||
# instead of writing None when archive.org rejects the url write the
|
||||
# url to resubmit it to archive.org. This is so when the user visits
|
||||
# the URL in person, it will attempt to re-archive it, and it'll show the
|
||||
# nicer error message explaining why the url was rejected if it fails.
|
||||
archive_org_url = archive_org_url or submit_url
|
||||
with open(os.path.join(out_dir, str(output)), 'w', encoding='utf-8') as f:
|
||||
f.write(archive_org_url)
|
||||
chmod_file('archive.org.txt', cwd=out_dir)
|
||||
output = archive_org_url
|
||||
|
||||
return ArchiveResult(
|
||||
cmd=cmd,
|
||||
pwd=out_dir,
|
||||
cmd_version=CURL_VERSION,
|
||||
output=output,
|
||||
status=status,
|
||||
**timer.stats,
|
||||
)
|
||||
|
||||
@enforce_types
|
||||
def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
|
||||
# Parse archive.org response headers
|
||||
headers: Dict[str, List[str]] = defaultdict(list)
|
||||
|
||||
# lowercase all the header names and store in dict
|
||||
for header in response.splitlines():
|
||||
if b':' not in header or not header.strip():
|
||||
continue
|
||||
name, val = header.decode().split(':', 1)
|
||||
headers[name.lower().strip()].append(val.strip())
|
||||
|
||||
# Get successful archive url in "content-location" header or any errors
|
||||
content_location = headers['content-location']
|
||||
errors = headers['x-archive-wayback-runtime-error']
|
||||
return content_location, errors
|
||||
|
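As a usage sketch, parse_archive_dot_org_response() above only looks at two lowercased header names, so its behaviour is easy to check in isolation (the sample response bytes below are made up; assumes the archivebox package is importable):

    from archivebox.extractors.archive_org import parse_archive_dot_org_response

    sample = (
        b'HTTP/2 200\r\n'
        b'Content-Location: /web/20190415000000/https://example.com\r\n'
        b'Content-Type: text/html\r\n'
    )
    content_location, errors = parse_archive_dot_org_response(sample)
    print(content_location)   # ['/web/20190415000000/https://example.com']
    print(errors)             # []  (no x-archive-wayback-runtime-error header present)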
archivebox/extractors/dom.py (new file, 73 lines)
|
@ -0,0 +1,73 @@
|
|||
__package__ = 'archivebox.extractors'
|
||||
|
||||
import os
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
TimedProgress,
|
||||
run,
|
||||
PIPE,
|
||||
is_static_file,
|
||||
ArchiveError,
|
||||
chrome_args,
|
||||
chmod_file,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
SAVE_DOM,
|
||||
CHROME_VERSION,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@enforce_types
|
||||
def should_save_dom(link: Link, out_dir: Optional[str]=None) -> bool:
|
||||
out_dir = out_dir or link.link_dir
|
||||
if is_static_file(link.url):
|
||||
return False
|
||||
|
||||
if os.path.exists(os.path.join(out_dir, 'output.html')):
|
||||
return False
|
||||
|
||||
return SAVE_DOM
|
||||
|
||||
@enforce_types
|
||||
def save_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
||||
"""print HTML of site to file using chrome --dump-html"""
|
||||
|
||||
out_dir = out_dir or link.link_dir
|
||||
output: ArchiveOutput = 'output.html'
|
||||
output_path = os.path.join(out_dir, str(output))
|
||||
cmd = [
|
||||
*chrome_args(TIMEOUT=timeout),
|
||||
'--dump-dom',
|
||||
link.url
|
||||
]
|
||||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
with open(output_path, 'w+') as f:
|
||||
result = run(cmd, stdout=f, stderr=PIPE, cwd=out_dir, timeout=timeout)
|
||||
|
||||
if result.returncode:
|
||||
hints = result.stderr.decode()
|
||||
raise ArchiveError('Failed to save DOM', hints)
|
||||
|
||||
chmod_file(output, cwd=out_dir)
|
||||
except Exception as err:
|
||||
status = 'failed'
|
||||
output = err
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
return ArchiveResult(
|
||||
cmd=cmd,
|
||||
pwd=out_dir,
|
||||
cmd_version=CHROME_VERSION,
|
||||
output=output,
|
||||
status=status,
|
||||
**timer.stats,
|
||||
)
|
archivebox/extractors/favicon.py (new file, 65 lines)
|
@ -0,0 +1,65 @@
|
|||
__package__ = 'archivebox.extractors'
|
||||
|
||||
import os
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
TimedProgress,
|
||||
domain,
|
||||
run,
|
||||
PIPE,
|
||||
chmod_file,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
SAVE_FAVICON,
|
||||
CURL_BINARY,
|
||||
CURL_VERSION,
|
||||
CHECK_SSL_VALIDITY,
|
||||
)
|
||||
|
||||
|
||||
@enforce_types
|
||||
def should_save_favicon(link: Link, out_dir: Optional[str]=None) -> bool:
|
||||
out_dir = out_dir or link.link_dir
|
||||
if os.path.exists(os.path.join(out_dir, 'favicon.ico')):
|
||||
return False
|
||||
|
||||
return SAVE_FAVICON
|
||||
|
||||
@enforce_types
|
||||
def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
||||
"""download site favicon from google's favicon api"""
|
||||
|
||||
out_dir = out_dir or link.link_dir
|
||||
output: ArchiveOutput = 'favicon.ico'
|
||||
cmd = [
|
||||
CURL_BINARY,
|
||||
'--max-time', str(timeout),
|
||||
'--location',
|
||||
'--output', str(output),
|
||||
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
||||
'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
|
||||
]
|
||||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
|
||||
chmod_file(output, cwd=out_dir)
|
||||
except Exception as err:
|
||||
status = 'failed'
|
||||
output = err
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
return ArchiveResult(
|
||||
cmd=cmd,
|
||||
pwd=out_dir,
|
||||
cmd_version=CURL_VERSION,
|
||||
output=output,
|
||||
status=status,
|
||||
**timer.stats,
|
||||
)
|
archivebox/extractors/git.py (new file, 94 lines)
|
@ -0,0 +1,94 @@
|
|||
__package__ = 'archivebox.extractors'
|
||||
|
||||
import os
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
TimedProgress,
|
||||
run,
|
||||
PIPE,
|
||||
is_static_file,
|
||||
ArchiveError,
|
||||
chmod_file,
|
||||
domain,
|
||||
extension,
|
||||
without_query,
|
||||
without_fragment,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
SAVE_GIT,
|
||||
GIT_BINARY,
|
||||
GIT_VERSION,
|
||||
GIT_DOMAINS,
|
||||
CHECK_SSL_VALIDITY
|
||||
)
|
||||
|
||||
|
||||
|
||||
@enforce_types
|
||||
def should_save_git(link: Link, out_dir: Optional[str]=None) -> bool:
|
||||
out_dir = out_dir or link.link_dir
|
||||
if is_static_file(link.url):
|
||||
return False
|
||||
|
||||
if os.path.exists(os.path.join(out_dir, 'git')):
|
||||
return False
|
||||
|
||||
is_clonable_url = (
|
||||
(domain(link.url) in GIT_DOMAINS)
|
||||
or (extension(link.url) == 'git')
|
||||
)
|
||||
if not is_clonable_url:
|
||||
return False
|
||||
|
||||
return SAVE_GIT
|
||||
|
||||
|
||||
@enforce_types
|
||||
def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
||||
"""download full site using git"""
|
||||
|
||||
out_dir = out_dir or link.link_dir
|
||||
output: ArchiveOutput = 'git'
|
||||
output_path = os.path.join(out_dir, str(output))
|
||||
os.makedirs(output_path, exist_ok=True)
|
||||
cmd = [
|
||||
GIT_BINARY,
|
||||
'clone',
|
||||
'--mirror',
|
||||
'--recursive',
|
||||
*([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
|
||||
without_query(without_fragment(link.url)),
|
||||
]
|
||||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
|
||||
|
||||
if result.returncode == 128:
|
||||
# ignore failed re-download when the folder already exists
|
||||
pass
|
||||
elif result.returncode > 0:
|
||||
hints = 'Got git response code: {}.'.format(result.returncode)
|
||||
raise ArchiveError('Failed to save git clone', hints)
|
||||
|
||||
chmod_file(output, cwd=out_dir)
|
||||
|
||||
except Exception as err:
|
||||
status = 'failed'
|
||||
output = err
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
return ArchiveResult(
|
||||
cmd=cmd,
|
||||
pwd=out_dir,
|
||||
cmd_version=GIT_VERSION,
|
||||
output=output,
|
||||
status=status,
|
||||
**timer.stats,
|
||||
)
|
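For reference, should_save_git() above only treats a URL as clonable when its domain is in GIT_DOMAINS or its extension is 'git', so ordinary pages are skipped before git is ever run. A rough standalone illustration of that check, using stdlib parsing and the documented default domain list in place of the real domain()/extension() helpers and the GIT_DOMAINS config value:

    from urllib.parse import urlparse

    GIT_DOMAINS = {'github.com', 'bitbucket.org', 'gitlab.com'}  # default from the example config

    def looks_clonable(url: str) -> bool:
        # mirrors the domain/extension test in should_save_git
        parsed = urlparse(url)
        hostname = parsed.hostname or ''
        last_segment = parsed.path.rsplit('/', 1)[-1]
        extension = last_segment.rsplit('.', 1)[-1] if '.' in last_segment else ''
        return hostname in GIT_DOMAINS or extension == 'git'

    print(looks_clonable('https://github.com/pirate/ArchiveBox'))   # True (domain match)
    print(looks_clonable('https://git.example.com/repo.git'))       # True (.git extension)
    print(looks_clonable('https://example.com/article.html'))       # False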
archivebox/extractors/media.py (new file, 100 lines)
|
@ -0,0 +1,100 @@
|
|||
__package__ = 'archivebox.extractors'
|
||||
|
||||
import os
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
TimedProgress,
|
||||
run,
|
||||
PIPE,
|
||||
is_static_file,
|
||||
ArchiveError,
|
||||
chmod_file,
|
||||
)
|
||||
from ..config import (
|
||||
MEDIA_TIMEOUT,
|
||||
SAVE_MEDIA,
|
||||
YOUTUBEDL_BINARY,
|
||||
YOUTUBEDL_VERSION,
|
||||
CHECK_SSL_VALIDITY
|
||||
)
|
||||
|
||||
|
||||
@enforce_types
|
||||
def should_save_media(link: Link, out_dir: Optional[str]=None) -> bool:
|
||||
out_dir = out_dir or link.link_dir
|
||||
|
||||
if is_static_file(link.url):
|
||||
return False
|
||||
|
||||
if os.path.exists(os.path.join(out_dir, 'media')):
|
||||
return False
|
||||
|
||||
return SAVE_MEDIA
|
||||
|
||||
@enforce_types
|
||||
def save_media(link: Link, out_dir: Optional[str]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
|
||||
"""Download playlists or individual video, audio, and subtitles using youtube-dl"""
|
||||
|
||||
out_dir = out_dir or link.link_dir
|
||||
output: ArchiveOutput = 'media'
|
||||
output_path = os.path.join(out_dir, str(output))
|
||||
os.makedirs(output_path, exist_ok=True)
|
||||
cmd = [
|
||||
YOUTUBEDL_BINARY,
|
||||
'--write-description',
|
||||
'--write-info-json',
|
||||
'--write-annotations',
|
||||
'--yes-playlist',
|
||||
'--write-thumbnail',
|
||||
'--no-call-home',
|
||||
'--no-check-certificate',
|
||||
'--user-agent',
|
||||
'--all-subs',
|
||||
'--extract-audio',
|
||||
'--keep-video',
|
||||
'--ignore-errors',
|
||||
'--geo-bypass',
|
||||
'--audio-format', 'mp3',
|
||||
'--audio-quality', '320K',
|
||||
'--embed-thumbnail',
|
||||
'--add-metadata',
|
||||
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
|
||||
link.url,
|
||||
]
|
||||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
|
||||
chmod_file(output, cwd=out_dir)
|
||||
if result.returncode:
|
||||
if (b'ERROR: Unsupported URL' in result.stderr
|
||||
or b'HTTP Error 404' in result.stderr
|
||||
or b'HTTP Error 403' in result.stderr
|
||||
or b'URL could be a direct video link' in result.stderr
|
||||
or b'Unable to extract container ID' in result.stderr):
|
||||
# These happen too frequently on non-media pages to warrant printing to console
|
||||
pass
|
||||
else:
|
||||
hints = (
|
||||
'Got youtube-dl response code: {}.'.format(result.returncode),
|
||||
*result.stderr.decode().split('\n'),
|
||||
)
|
||||
raise ArchiveError('Failed to save media', hints)
|
||||
except Exception as err:
|
||||
status = 'failed'
|
||||
output = err
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
return ArchiveResult(
|
||||
cmd=cmd,
|
||||
pwd=out_dir,
|
||||
cmd_version=YOUTUBEDL_VERSION,
|
||||
output=output,
|
||||
status=status,
|
||||
**timer.stats,
|
||||
)
|
archivebox/extractors/pdf.py (new file, 72 lines)
|
@ -0,0 +1,72 @@
|
|||
__package__ = 'archivebox.extractors'
|
||||
|
||||
import os
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
TimedProgress,
|
||||
run,
|
||||
PIPE,
|
||||
is_static_file,
|
||||
ArchiveError,
|
||||
chrome_args,
|
||||
chmod_file,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
SAVE_PDF,
|
||||
CHROME_VERSION,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@enforce_types
|
||||
def should_save_pdf(link: Link, out_dir: Optional[str]=None) -> bool:
|
||||
out_dir = out_dir or link.link_dir
|
||||
if is_static_file(link.url):
|
||||
return False
|
||||
|
||||
if os.path.exists(os.path.join(out_dir, 'output.pdf')):
|
||||
return False
|
||||
|
||||
return SAVE_PDF
|
||||
|
||||
|
||||
@enforce_types
|
||||
def save_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
||||
"""print PDF of site to file using chrome --headless"""
|
||||
|
||||
out_dir = out_dir or link.link_dir
|
||||
output: ArchiveOutput = 'output.pdf'
|
||||
cmd = [
|
||||
*chrome_args(TIMEOUT=timeout),
|
||||
'--print-to-pdf',
|
||||
link.url,
|
||||
]
|
||||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
|
||||
|
||||
if result.returncode:
|
||||
hints = (result.stderr or result.stdout).decode()
|
||||
raise ArchiveError('Failed to save PDF', hints)
|
||||
|
||||
chmod_file('output.pdf', cwd=out_dir)
|
||||
except Exception as err:
|
||||
status = 'failed'
|
||||
output = err
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
return ArchiveResult(
|
||||
cmd=cmd,
|
||||
pwd=out_dir,
|
||||
cmd_version=CHROME_VERSION,
|
||||
output=output,
|
||||
status=status,
|
||||
**timer.stats,
|
||||
)
|
archivebox/extractors/screenshot.py (new file, 71 lines)
|
@ -0,0 +1,71 @@
|
|||
__package__ = 'archivebox.extractors'
|
||||
|
||||
import os
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
TimedProgress,
|
||||
run,
|
||||
PIPE,
|
||||
is_static_file,
|
||||
ArchiveError,
|
||||
chrome_args,
|
||||
chmod_file,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
SAVE_SCREENSHOT,
|
||||
CHROME_VERSION,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@enforce_types
|
||||
def should_save_screenshot(link: Link, out_dir: Optional[str]=None) -> bool:
|
||||
out_dir = out_dir or link.link_dir
|
||||
if is_static_file(link.url):
|
||||
return False
|
||||
|
||||
if os.path.exists(os.path.join(out_dir, 'screenshot.png')):
|
||||
return False
|
||||
|
||||
return SAVE_SCREENSHOT
|
||||
|
||||
@enforce_types
|
||||
def save_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
||||
"""take screenshot of site using chrome --headless"""
|
||||
|
||||
out_dir = out_dir or link.link_dir
|
||||
output: ArchiveOutput = 'screenshot.png'
|
||||
cmd = [
|
||||
*chrome_args(TIMEOUT=timeout),
|
||||
'--screenshot',
|
||||
link.url,
|
||||
]
|
||||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
|
||||
|
||||
if result.returncode:
|
||||
hints = (result.stderr or result.stdout).decode()
|
||||
raise ArchiveError('Failed to save screenshot', hints)
|
||||
|
||||
chmod_file(output, cwd=out_dir)
|
||||
except Exception as err:
|
||||
status = 'failed'
|
||||
output = err
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
return ArchiveResult(
|
||||
cmd=cmd,
|
||||
pwd=out_dir,
|
||||
cmd_version=CHROME_VERSION,
|
||||
output=output,
|
||||
status=status,
|
||||
**timer.stats,
|
||||
)
|
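For reference, save_pdf(), save_screenshot(), and save_dom() above are near-identical: build a command from chrome_args(), run it under a TimedProgress spinner, treat a non-zero return code as an ArchiveError, chmod the output, and always stop the timer in a finally block. The shared skeleton reduced to its control flow (run_extractor and the echo command are placeholders for illustration):

    import subprocess

    def run_extractor(cmd, timeout):
        """generic shape of the try/except/finally used by each save_* function"""
        status, output = 'succeeded', None
        try:
            result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=timeout)
            if result.returncode:
                raise RuntimeError((result.stderr or result.stdout).decode())
            output = result.stdout.decode()
        except Exception as err:
            status, output = 'failed', err
        finally:
            pass  # the real save_* functions call timer.end() here so the spinner always stops
        return status, output

    print(run_extractor(['echo', 'hello'], timeout=5))   # ('succeeded', 'hello\n')

Keeping the success/failure accounting inside each save_* function is what lets archive_link() treat every method uniformly through the ARCHIVE_METHODS tuple.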
archivebox/extractors/title.py (new file, 63 lines)
|
@ -0,0 +1,63 @@
|
|||
__package__ = 'archivebox.extractors'
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
TimedProgress,
|
||||
is_static_file,
|
||||
ArchiveError,
|
||||
fetch_page_title,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
SAVE_TITLE,
|
||||
CURL_BINARY,
|
||||
CURL_VERSION,
|
||||
)
|
||||
|
||||
|
||||
@enforce_types
|
||||
def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool:
|
||||
# if link already has valid title, skip it
|
||||
if link.title and not link.title.lower().startswith('http'):
|
||||
return False
|
||||
|
||||
if is_static_file(link.url):
|
||||
return False
|
||||
|
||||
return SAVE_TITLE
|
||||
|
||||
@enforce_types
|
||||
def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
||||
"""try to guess the page's title from its content"""
|
||||
|
||||
output: ArchiveOutput = None
|
||||
cmd = [
|
||||
CURL_BINARY,
|
||||
link.url,
|
||||
'|',
|
||||
'grep',
|
||||
'<title',
|
||||
]
|
||||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
output = fetch_page_title(link.url, timeout=timeout, progress=False)
|
||||
if not output:
|
||||
raise ArchiveError('Unable to detect page title')
|
||||
except Exception as err:
|
||||
status = 'failed'
|
||||
output = err
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
return ArchiveResult(
|
||||
cmd=cmd,
|
||||
pwd=out_dir,
|
||||
cmd_version=CURL_VERSION,
|
||||
output=output,
|
||||
status=status,
|
||||
**timer.stats,
|
||||
)
|
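Note that save_title() above records a curl | grep command for display purposes only; the actual work is done in Python by fetch_page_title() from ..util, and the cmd list is never executed. A hedged usage sketch of that helper (only the arguments already used above are assumed):

    from archivebox.util import fetch_page_title

    # the same call save_title() makes internally; a falsy result means no <title> was found
    title = fetch_page_title('https://example.com', timeout=10, progress=False)
    print(title or '(no title detected)')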
archivebox/extractors/wget.py (new file, 123 lines)
|
@ -0,0 +1,123 @@
|
|||
__package__ = 'archivebox.extractors'
|
||||
|
||||
import os
|
||||
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
TimedProgress,
|
||||
run,
|
||||
PIPE,
|
||||
wget_output_path,
|
||||
ArchiveError,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
SAVE_WGET,
|
||||
SAVE_WARC,
|
||||
WGET_BINARY,
|
||||
WGET_VERSION,
|
||||
CHECK_SSL_VALIDITY,
|
||||
SAVE_WGET_REQUISITES,
|
||||
WGET_AUTO_COMPRESSION,
|
||||
WGET_USER_AGENT,
|
||||
COOKIES_FILE,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@enforce_types
|
||||
def should_save_wget(link: Link, out_dir: Optional[str]=None) -> bool:
|
||||
output_path = wget_output_path(link)
|
||||
out_dir = out_dir or link.link_dir
|
||||
if output_path and os.path.exists(os.path.join(out_dir, output_path)):
|
||||
return False
|
||||
|
||||
return SAVE_WGET
|
||||
|
||||
|
||||
@enforce_types
|
||||
def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
||||
"""download full site using wget"""
|
||||
|
||||
out_dir = out_dir or link.link_dir
|
||||
if SAVE_WARC:
|
||||
warc_dir = os.path.join(out_dir, 'warc')
|
||||
os.makedirs(warc_dir, exist_ok=True)
|
||||
warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
|
||||
|
||||
# WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
|
||||
output: ArchiveOutput = None
|
||||
cmd = [
|
||||
WGET_BINARY,
|
||||
# '--server-response', # print headers for better error parsing
|
||||
'--no-verbose',
|
||||
'--adjust-extension',
|
||||
'--convert-links',
|
||||
'--force-directories',
|
||||
'--backup-converted',
|
||||
'--span-hosts',
|
||||
'--no-parent',
|
||||
'-e', 'robots=off',
|
||||
'--restrict-file-names=windows',
|
||||
'--timeout={}'.format(timeout),
|
||||
*([] if SAVE_WARC else ['--timestamping']),
|
||||
*(['--warc-file={}'.format(warc_path)] if SAVE_WARC else []),
|
||||
*(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
|
||||
*(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
|
||||
*(['--load-cookies', COOKIES_FILE] if COOKIES_FILE else []),
|
||||
*(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
|
||||
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
|
||||
link.url,
|
||||
]
|
||||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
|
||||
output = wget_output_path(link)
|
||||
|
||||
# parse out number of files downloaded from last line of stderr:
|
||||
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
|
||||
output_tail = [
|
||||
line.strip()
|
||||
for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
|
||||
if line.strip()
|
||||
]
|
||||
files_downloaded = (
|
||||
int(output_tail[-1].strip().split(' ', 2)[1] or 0)
|
||||
if 'Downloaded:' in output_tail[-1]
|
||||
else 0
|
||||
)
|
||||
|
||||
# Check for common failure cases
|
||||
if result.returncode > 0 and files_downloaded < 1:
|
||||
hints = (
|
||||
'Got wget response code: {}.'.format(result.returncode),
|
||||
*output_tail,
|
||||
)
|
||||
if b'403: Forbidden' in result.stderr:
|
||||
raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
|
||||
if b'404: Not Found' in result.stderr:
|
||||
raise ArchiveError('404 Not Found', hints)
|
||||
if b'ERROR 500: Internal Server Error' in result.stderr:
|
||||
raise ArchiveError('500 Internal Server Error', hints)
|
||||
raise ArchiveError('Got an error from the server', hints)
|
||||
|
||||
# chmod_file(output, cwd=out_dir)
|
||||
except Exception as err:
|
||||
status = 'failed'
|
||||
output = err
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
return ArchiveResult(
|
||||
cmd=cmd,
|
||||
pwd=out_dir,
|
||||
cmd_version=WGET_VERSION,
|
||||
output=output,
|
||||
status=status,
|
||||
**timer.stats,
|
||||
)
|
|
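For reference, wget can exit non-zero when a single page requisite fails even though the main page itself saved, so save_wget() above also scrapes the 'Downloaded: N files, ...' summary from the tail of its output and only treats the run as failed when nothing was downloaded at all. The parsing step in isolation, on a made-up summary line:

    output_tail = ['Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)']

    files_downloaded = (
        int(output_tail[-1].strip().split(' ', 2)[1] or 0)
        if 'Downloaded:' in output_tail[-1]
        else 0
    )
    print(files_downloaded)   # 76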
@ -1,14 +1,25 @@
|
|||
__package__ = 'archivebox.legacy'
|
||||
__package__ = 'archivebox.index'
|
||||
|
||||
import re
|
||||
import os
|
||||
import json
|
||||
import shutil
|
||||
import json as pyjson
|
||||
|
||||
from typing import List, Tuple, Optional, Iterable
|
||||
from itertools import chain
|
||||
from typing import List, Tuple, Dict, Optional, Iterable
|
||||
from collections import OrderedDict
|
||||
from contextlib import contextmanager
|
||||
|
||||
from .schema import Link, ArchiveResult
|
||||
from .config import (
|
||||
from ..parsers import parse_links
|
||||
from ..util import (
|
||||
scheme,
|
||||
enforce_types,
|
||||
TimedProgress,
|
||||
atomic_write,
|
||||
ExtendedEncoder,
|
||||
)
|
||||
from ..config import (
|
||||
ARCHIVE_DIR_NAME,
|
||||
SQL_INDEX_FILENAME,
|
||||
JSON_INDEX_FILENAME,
|
||||
HTML_INDEX_FILENAME,
|
||||
|
@ -18,26 +29,7 @@ from .config import (
|
|||
ANSI,
|
||||
stderr,
|
||||
)
|
||||
from .storage.html import write_html_main_index, write_html_link_details
|
||||
from .storage.json import (
|
||||
parse_json_main_index,
|
||||
write_json_main_index,
|
||||
parse_json_link_details,
|
||||
write_json_link_details,
|
||||
)
|
||||
from .storage.sql import (
|
||||
write_sql_main_index,
|
||||
parse_sql_main_index,
|
||||
)
|
||||
from .util import (
|
||||
scheme,
|
||||
enforce_types,
|
||||
TimedProgress,
|
||||
atomic_write,
|
||||
ExtendedEncoder,
|
||||
)
|
||||
from .parse import parse_links
|
||||
from .logs import (
|
||||
from ..cli.logging import (
|
||||
log_indexing_process_started,
|
||||
log_indexing_process_finished,
|
||||
log_indexing_started,
|
||||
|
@ -46,6 +38,22 @@ from .logs import (
|
|||
log_parsing_finished,
|
||||
)
|
||||
|
||||
from .schema import Link, ArchiveResult
|
||||
from .html import (
|
||||
write_html_main_index,
|
||||
write_html_link_details,
|
||||
)
|
||||
from .json import (
|
||||
parse_json_main_index,
|
||||
write_json_main_index,
|
||||
parse_json_link_details,
|
||||
write_json_link_details,
|
||||
)
|
||||
from .sql import (
|
||||
write_sql_main_index,
|
||||
parse_sql_main_index,
|
||||
)
|
||||
|
||||
### Link filtering and checking
|
||||
|
||||
@enforce_types
|
||||
|
@ -95,11 +103,11 @@ def merge_links(a: Link, b: Link) -> Link:
|
|||
}
|
||||
for method in all_methods:
|
||||
deduped_jsons = {
|
||||
json.dumps(result, sort_keys=True, cls=ExtendedEncoder)
|
||||
pyjson.dumps(result, sort_keys=True, cls=ExtendedEncoder)
|
||||
for result in history[method]
|
||||
}
|
||||
history[method] = list(reversed(sorted(
|
||||
(ArchiveResult.from_json(json.loads(result)) for result in deduped_jsons),
|
||||
(ArchiveResult.from_json(pyjson.loads(result)) for result in deduped_jsons),
|
||||
key=lambda result: result.start_ts,
|
||||
)))
|
||||
|
||||
|
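For reference, merge_links() dedupes each method's history by serialising every ArchiveResult to a canonical JSON string (sort_keys=True) and collecting the strings in a set, a simple way to get value-equality for records that aren't directly hashable. The same trick on plain dicts:

    import json

    results = [
        {'status': 'succeeded', 'output': 'output.pdf'},
        {'output': 'output.pdf', 'status': 'succeeded'},   # same record, different key order
        {'status': 'failed', 'output': 'output.pdf'},
    ]

    deduped = {json.dumps(r, sort_keys=True) for r in results}
    unique_results = [json.loads(s) for s in deduped]
    print(len(unique_results))   # 2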
@ -114,7 +122,7 @@ def merge_links(a: Link, b: Link) -> Link:
|
|||
|
||||
|
||||
@enforce_types
|
||||
def validate_links(links: Iterable[Link]) -> Iterable[Link]:
|
||||
def validate_links(links: Iterable[Link]) -> List[Link]:
|
||||
links = archivable_links(links) # remove chrome://, about:, mailto: etc.
|
||||
links = sorted_links(links) # deterministically sort the links based on timestamp, url
|
||||
links = uniquefied_links(links) # merge/dedupe duplicate timestamps & urls
|
||||
|
@ -128,7 +136,7 @@ def validate_links(links: Iterable[Link]) -> Iterable[Link]:
|
|||
stderr(' archivebox help')
|
||||
raise SystemExit(1)
|
||||
|
||||
return links
|
||||
return list(links)
|
||||
|
||||
|
||||
@enforce_types
|
||||
|
@ -259,23 +267,32 @@ def load_main_index_meta(out_dir: str=OUTPUT_DIR) -> Optional[dict]:
|
|||
index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
|
||||
if os.path.exists(index_path):
|
||||
with open(index_path, 'r', encoding='utf-8') as f:
|
||||
meta_dict = json.load(f)
|
||||
meta_dict = pyjson.load(f)
|
||||
meta_dict.pop('links')
|
||||
return meta_dict
|
||||
|
||||
return None
|
||||
|
||||
@enforce_types
|
||||
def import_new_links(existing_links: List[Link], import_path: str) -> Tuple[List[Link], List[Link]]:
|
||||
def import_new_links(existing_links: List[Link],
|
||||
import_path: str,
|
||||
out_dir: str=OUTPUT_DIR) -> Tuple[List[Link], List[Link]]:
|
||||
|
||||
new_links: List[Link] = []
|
||||
|
||||
# parse and validate the import file
|
||||
log_parsing_started(import_path)
|
||||
raw_links, parser_name = parse_links(import_path)
|
||||
new_links = list(validate_links(raw_links))
|
||||
new_links = validate_links(raw_links)
|
||||
|
||||
# merge existing links in out_dir and new links
|
||||
all_links = list(validate_links(existing_links + new_links))
|
||||
all_links = validate_links(existing_links + new_links)
|
||||
all_link_urls = {link.url for link in existing_links}
|
||||
|
||||
new_links = [
|
||||
link for link in new_links
|
||||
if link.url not in all_link_urls
|
||||
]
|
||||
|
||||
if parser_name:
|
||||
num_parsed = len(raw_links)
|
||||
|
@ -345,3 +362,231 @@ def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
|
|||
return merge_links(existing_link, link)
|
||||
|
||||
return link
|
||||
|
||||
|
||||
|
||||
LINK_FILTERS = {
|
||||
'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
|
||||
'substring': lambda link, pattern: pattern in link.url,
|
||||
'regex': lambda link, pattern: bool(re.match(pattern, link.url)),
|
||||
'domain': lambda link, pattern: link.domain == pattern,
|
||||
}
|
||||
|
||||
@enforce_types
|
||||
def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str='exact') -> bool:
|
||||
for pattern in filter_patterns:
|
||||
try:
|
||||
if LINK_FILTERS[filter_type](link, pattern):
|
||||
return True
|
||||
except Exception:
|
||||
stderr()
|
||||
stderr(
|
||||
f'[X] Got invalid pattern for --filter-type={filter_type}:',
|
||||
color='red',
|
||||
)
|
||||
stderr(f' {pattern}')
|
||||
raise SystemExit(2)
|
||||
|
||||
return False
|
||||
|
||||
|
||||
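The LINK_FILTERS table above is what backs the --filter-type option, so adding a new filter type only requires adding a lambda to that dict. A usage sketch, assuming a Link loaded from a hypothetical timestamp folder:

    from archivebox.index import link_matches_filter, parse_json_link_details

    link = parse_json_link_details('output/archive/1556534245')   # hypothetical data dir

    if link and link_matches_filter(link, [r'^https?://(www\.)?example\.com'], filter_type='regex'):
        print('matched:', link.url)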
def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||
"""indexed links without checking archive status or data directory validity"""
|
||||
return {
|
||||
link.link_dir: link
|
||||
for link in links
|
||||
}
|
||||
|
||||
def get_archived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||
"""indexed links that are archived with a valid data directory"""
|
||||
return {
|
||||
link.link_dir: link
|
||||
for link in filter(is_archived, links)
|
||||
}
|
||||
|
||||
def get_unarchived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||
"""indexed links that are unarchived with no data directory or an empty data directory"""
|
||||
return {
|
||||
link.link_dir: link
|
||||
for link in filter(is_unarchived, links)
|
||||
}
|
||||
|
||||
def get_present_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||
"""dirs that are expected to exist based on the main index"""
|
||||
all_folders = {}
|
||||
|
||||
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
|
||||
if entry.is_dir(follow_symlinks=True):
|
||||
link = None
|
||||
try:
|
||||
link = parse_json_link_details(entry.path)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
all_folders[entry.path] = link
|
||||
|
||||
return all_folders
|
||||
|
||||
def get_valid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||
"""dirs with a valid index matched to the main index and archived content"""
|
||||
return {
|
||||
link.link_dir: link
|
||||
for link in filter(is_valid, links)
|
||||
}
|
||||
|
||||
def get_invalid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||
"""dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
|
||||
duplicate = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
|
||||
orphaned = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
|
||||
corrupted = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
|
||||
unrecognized = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
|
||||
return {**duplicate, **orphaned, **corrupted, **unrecognized}
|
||||
|
||||
|
||||
def get_duplicate_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||
"""dirs that conflict with other directories that have the same link URL or timestamp"""
|
||||
links = list(links)
|
||||
by_url = {link.url: 0 for link in links}
|
||||
by_timestamp = {link.timestamp: 0 for link in links}
|
||||
|
||||
duplicate_folders = {}
|
||||
|
||||
indexed_folders = {link.link_dir for link in links}
|
||||
data_folders = (
|
||||
entry.path
|
||||
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME))
|
||||
if entry.is_dir(follow_symlinks=True) and entry.path not in indexed_folders
|
||||
)
|
||||
|
||||
for path in chain(sorted(indexed_folders), sorted(data_folders)):
|
||||
link = None
|
||||
try:
|
||||
link = parse_json_link_details(path)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if link:
|
||||
# link folder has same timestamp as different link folder
|
||||
by_timestamp[link.timestamp] = by_timestamp.get(link.timestamp, 0) + 1
|
||||
if by_timestamp[link.timestamp] > 1:
|
||||
duplicate_folders[path] = link
|
||||
|
||||
# link folder has same url as different link folder
|
||||
by_url[link.url] = by_url.get(link.url, 0) + 1
|
||||
if by_url[link.url] > 1:
|
||||
duplicate_folders[path] = link
|
||||
|
||||
return duplicate_folders
|
||||
|
||||
def get_orphaned_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||
"""dirs that contain a valid index but aren't listed in the main index"""
|
||||
links = list(links)
|
||||
indexed_folders = {link.link_dir: link for link in links}
|
||||
orphaned_folders = {}
|
||||
|
||||
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
|
||||
if entry.is_dir(follow_symlinks=True):
|
||||
link = None
|
||||
try:
|
||||
link = parse_json_link_details(entry.path)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if link and entry.path not in indexed_folders:
|
||||
# folder is a valid link data dir with index details, but it's not in the main index
|
||||
orphaned_folders[entry.path] = link
|
||||
|
||||
return orphaned_folders
|
||||
|
||||
def get_corrupted_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||
"""dirs that don't contain a valid index and aren't listed in the main index"""
|
||||
return {
|
||||
link.link_dir: link
|
||||
for link in filter(is_corrupt, links)
|
||||
}
|
||||
|
||||
def get_unrecognized_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||
"""dirs that don't contain recognizable archive data and aren't listed in the main index"""
|
||||
by_timestamp = {link.timestamp: 0 for link in links}
|
||||
unrecognized_folders: Dict[str, Optional[Link]] = {}
|
||||
|
||||
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
|
||||
if entry.is_dir(follow_symlinks=True):
|
||||
index_exists = os.path.exists(os.path.join(entry.path, 'index.json'))
|
||||
link = None
|
||||
try:
|
||||
link = parse_json_link_details(entry.path)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if index_exists and link is None:
|
||||
# index exists but it's corrupted or unparseable
|
||||
unrecognized_folders[entry.path] = link
|
||||
|
||||
elif not index_exists:
|
||||
# link details index doesn't exist and the folder isn't in the main index
|
||||
timestamp = entry.path.rsplit('/', 1)[-1]
|
||||
if timestamp not in by_timestamp:
|
||||
unrecognized_folders[entry.path] = link
|
||||
|
||||
return unrecognized_folders
|
||||
|
||||
|
||||
def is_valid(link: Link) -> bool:
|
||||
dir_exists = os.path.exists(link.link_dir)
|
||||
index_exists = os.path.exists(os.path.join(link.link_dir, 'index.json'))
|
||||
if not dir_exists:
|
||||
# unarchived links are not included in the valid list
|
||||
return False
|
||||
if dir_exists and not index_exists:
|
||||
return False
|
||||
if dir_exists and index_exists:
|
||||
try:
|
||||
parsed_link = parse_json_link_details(link.link_dir)
|
||||
return link.url == parsed_link.url
|
||||
except Exception:
|
||||
pass
|
||||
return False
|
||||
|
||||
def is_corrupt(link: Link) -> bool:
|
||||
if not os.path.exists(link.link_dir):
|
||||
# unarchived links are not considered corrupt
|
||||
return False
|
||||
|
||||
if is_valid(link):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def is_archived(link: Link) -> bool:
|
||||
return is_valid(link) and link.is_archived
|
||||
|
||||
def is_unarchived(link: Link) -> bool:
|
||||
if not os.path.exists(link.link_dir):
|
||||
return True
|
||||
return not link.is_archived
|
||||
|
||||
|
||||
def fix_invalid_folder_locations(out_dir: str=OUTPUT_DIR) -> Tuple[List[str], List[str]]:
|
||||
fixed = []
|
||||
cant_fix = []
|
||||
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
|
||||
if entry.is_dir(follow_symlinks=True):
|
||||
if os.path.exists(os.path.join(entry.path, 'index.json')):
|
||||
link = parse_json_link_details(entry.path)
|
||||
if not link:
|
||||
continue
|
||||
|
||||
if not entry.path.endswith(f'/{link.timestamp}'):
|
||||
dest = os.path.join(out_dir, ARCHIVE_DIR_NAME, link.timestamp)
|
||||
if os.path.exists(dest):
|
||||
cant_fix.append(entry.path)
|
||||
else:
|
||||
shutil.move(entry.path, dest)
|
||||
fixed.append(dest)
|
||||
|
||||
if link.link_dir != entry.path:
|
||||
link = link.overwrite(link_dir=entry.path)
|
||||
write_json_link_details(link, out_dir=entry.path)
|
||||
|
||||
return fixed, cant_fix
|
|
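For reference, the get_*_folders() helpers above all share the (links, out_dir) signature and return Dict[str, Optional[Link]], so a data-folder status report is just a matter of composing them the way get_invalid_folders() already does. A hedged sketch of that composition (load_main_index is referenced elsewhere in this diff but its exact signature isn't shown here, so the call below is an assumption):

    from archivebox.config import OUTPUT_DIR
    from archivebox.index import (
        load_main_index,
        get_archived_folders,
        get_unarchived_folders,
        get_invalid_folders,
    )

    links = list(load_main_index(out_dir=OUTPUT_DIR))

    for label, folders in (
        ('archived', get_archived_folders(links, out_dir=OUTPUT_DIR)),
        ('unarchived', get_unarchived_folders(links, out_dir=OUTPUT_DIR)),
        ('invalid', get_invalid_folders(links, out_dir=OUTPUT_DIR)),
    ):
        print(f'{label:<12} {len(folders)}')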
@ -1,11 +1,22 @@
|
|||
__package__ = 'archivebox.legacy.storage'
|
||||
__package__ = 'archivebox.index'
|
||||
|
||||
import os
|
||||
|
||||
from datetime import datetime
|
||||
from typing import List, Optional, Iterator
|
||||
|
||||
from ..schema import Link
|
||||
from .schema import Link
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
ts_to_date,
|
||||
urlencode,
|
||||
htmlencode,
|
||||
urldecode,
|
||||
wget_output_path,
|
||||
render_template,
|
||||
atomic_write,
|
||||
copy_and_overwrite,
|
||||
)
|
||||
from ..config import (
|
||||
OUTPUT_DIR,
|
||||
TEMPLATES_DIR,
|
||||
|
@ -18,17 +29,6 @@ from ..config import (
|
|||
ROBOTS_TXT_FILENAME,
|
||||
FAVICON_FILENAME,
|
||||
)
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
ts_to_date,
|
||||
urlencode,
|
||||
htmlencode,
|
||||
urldecode,
|
||||
wget_output_path,
|
||||
render_template,
|
||||
atomic_write,
|
||||
copy_and_overwrite,
|
||||
)
|
||||
|
||||
join = lambda *paths: os.path.join(*paths)
|
||||
MAIN_INDEX_TEMPLATE = join(TEMPLATES_DIR, 'main_index.html')
|
|
@ -1,4 +1,4 @@
|
|||
__package__ = 'archivebox.legacy.storage'
|
||||
__package__ = 'archivebox.index'
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
@ -7,7 +7,8 @@ import json
|
|||
from datetime import datetime
|
||||
from typing import List, Optional, Iterator
|
||||
|
||||
from ..schema import Link, ArchiveResult
|
||||
from .schema import Link, ArchiveResult
|
||||
from ..util import enforce_types, atomic_write
|
||||
from ..config import (
|
||||
VERSION,
|
||||
OUTPUT_DIR,
|
||||
|
@ -17,14 +18,11 @@ from ..config import (
|
|||
JSON_INDEX_FILENAME,
|
||||
ARCHIVE_DIR_NAME,
|
||||
)
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
atomic_write,
|
||||
)
|
||||
|
||||
|
||||
MAIN_INDEX_HEADER = {
|
||||
'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
|
||||
'schema': 'archivebox.legacy.storage.json',
|
||||
'schema': 'archivebox.index.json',
|
||||
'copyright_info': FOOTER_INFO,
|
||||
'meta': {
|
||||
'project': 'ArchiveBox',
|
||||
|
@ -43,7 +41,7 @@ MAIN_INDEX_HEADER = {
|
|||
|
||||
@enforce_types
|
||||
def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
|
||||
"""parse a archive index json file and return the list of links"""
|
||||
"""parse an archive index json file and return the list of links"""
|
||||
|
||||
index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
|
||||
if os.path.exists(index_path):
|
||||
|
@ -110,4 +108,6 @@ def parse_json_links_details(out_dir: str) -> Iterator[Link]:
|
|||
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
|
||||
if entry.is_dir(follow_symlinks=True):
|
||||
if os.path.exists(os.path.join(entry.path, 'index.json')):
|
||||
yield parse_json_link_details(entry.path)
|
||||
link = parse_json_link_details(entry.path)
|
||||
if link:
|
||||
yield link
|
|
@ -1,3 +1,5 @@
|
|||
__package__ = 'archivebox.index'
|
||||
|
||||
import os
|
||||
|
||||
from datetime import datetime
|
||||
|
@ -48,7 +50,7 @@ class ArchiveResult:
|
|||
|
||||
@classmethod
|
||||
def from_json(cls, json_info):
|
||||
from .util import parse_date
|
||||
from ..util import parse_date
|
||||
|
||||
info = {
|
||||
key: val
|
||||
|
@ -60,12 +62,12 @@ class ArchiveResult:
|
|||
return cls(**info)
|
||||
|
||||
def to_json(self, indent=4, sort_keys=True):
|
||||
from .util import to_json
|
||||
from ..util import to_json
|
||||
|
||||
return to_json(self, indent=indent, sort_keys=sort_keys)
|
||||
|
||||
def to_csv(self, cols=None, ljust: int=0, separator: str=','):
|
||||
from .util import to_json
|
||||
from ..util import to_json
|
||||
|
||||
cols = cols or self.field_names()
|
||||
return separator.join(
|
||||
|
@ -115,7 +117,7 @@ class Link:
|
|||
return float(self.timestamp) > float(other.timestamp)
|
||||
|
||||
def typecheck(self) -> None:
|
||||
from .config import stderr, ANSI
|
||||
from ..config import stderr, ANSI
|
||||
try:
|
||||
assert self.schema == self.__class__.__name__
|
||||
assert isinstance(self.timestamp, str) and self.timestamp
|
||||
|
@ -176,7 +178,7 @@ class Link:
|
|||
|
||||
@classmethod
|
||||
def from_json(cls, json_info):
|
||||
from .util import parse_date
|
||||
from ..util import parse_date
|
||||
|
||||
info = {
|
||||
key: val
|
||||
|
@ -200,12 +202,12 @@ class Link:
|
|||
return cls(**info)
|
||||
|
||||
def to_json(self, indent=4, sort_keys=True):
|
||||
from .util import to_json
|
||||
from ..util import to_json
|
||||
|
||||
return to_json(self, indent=indent, sort_keys=sort_keys)
|
||||
|
||||
def to_csv(self, csv_cols: List[str], ljust: int=0, separator: str=','):
|
||||
from .util import to_json
|
||||
from ..util import to_json
|
||||
|
||||
return separator.join(
|
||||
to_json(getattr(self, col), indent=None).ljust(ljust)
|
||||
|
@ -218,60 +220,60 @@ class Link:
|
|||
|
||||
@property
|
||||
def link_dir(self) -> str:
|
||||
from .config import CONFIG
|
||||
from ..config import CONFIG
|
||||
return os.path.join(CONFIG['ARCHIVE_DIR'], self.timestamp)
|
||||
|
||||
@property
|
||||
def archive_path(self) -> str:
|
||||
from .config import ARCHIVE_DIR_NAME
|
||||
from ..config import ARCHIVE_DIR_NAME
|
||||
return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)
|
||||
|
||||
### URL Helpers
|
||||
@property
|
||||
def url_hash(self):
|
||||
from .util import hashurl
|
||||
from ..util import hashurl
|
||||
|
||||
return hashurl(self.url)
|
||||
|
||||
@property
|
||||
def scheme(self) -> str:
|
||||
from .util import scheme
|
||||
from ..util import scheme
|
||||
return scheme(self.url)
|
||||
|
||||
@property
|
||||
def extension(self) -> str:
|
||||
from .util import extension
|
||||
from ..util import extension
|
||||
return extension(self.url)
|
||||
|
||||
@property
|
||||
def domain(self) -> str:
|
||||
from .util import domain
|
||||
from ..util import domain
|
||||
return domain(self.url)
|
||||
|
||||
@property
|
||||
def path(self) -> str:
|
||||
from .util import path
|
||||
from ..util import path
|
||||
return path(self.url)
|
||||
|
||||
@property
|
||||
def basename(self) -> str:
|
||||
from .util import basename
|
||||
from ..util import basename
|
||||
return basename(self.url)
|
||||
|
||||
@property
|
||||
def base_url(self) -> str:
|
||||
from .util import base_url
|
||||
from ..util import base_url
|
||||
return base_url(self.url)
|
||||
|
||||
### Pretty Printing Helpers
|
||||
@property
|
||||
def bookmarked_date(self) -> Optional[str]:
|
||||
from .util import ts_to_date
|
||||
from ..util import ts_to_date
|
||||
return ts_to_date(self.timestamp) if self.timestamp else None
|
||||
|
||||
@property
|
||||
def updated_date(self) -> Optional[str]:
|
||||
from .util import ts_to_date
|
||||
from ..util import ts_to_date
|
||||
return ts_to_date(self.updated) if self.updated else None
|
||||
|
||||
@property
|
||||
|
@ -304,13 +306,13 @@ class Link:
|
|||
|
||||
@property
|
||||
def is_static(self) -> bool:
|
||||
from .util import is_static_file
|
||||
from ..util import is_static_file
|
||||
return is_static_file(self.url)
|
||||
|
||||
@property
|
||||
def is_archived(self) -> bool:
|
||||
from .config import ARCHIVE_DIR
|
||||
from .util import domain
|
||||
from ..config import ARCHIVE_DIR
|
||||
from ..util import domain
|
||||
|
||||
output_paths = (
|
||||
domain(self.url),
|
||||
|
@ -352,7 +354,7 @@ class Link:
|
|||
def canonical_outputs(self) -> Dict[str, Optional[str]]:
|
||||
"""predict the expected output paths that should be present after archiving"""
|
||||
|
||||
from .util import wget_output_path
|
||||
from ..util import wget_output_path
|
||||
canonical = {
|
||||
'index_path': 'index.html',
|
||||
'favicon_path': 'favicon.ico',
|
|
@ -1,9 +1,9 @@
|
|||
__package__ = 'archivebox.legacy.storage'
|
||||
__package__ = 'archivebox.index'
|
||||
|
||||
from io import StringIO
|
||||
from typing import List, Tuple, Iterator
|
||||
|
||||
from ..schema import Link
|
||||
from .schema import Link
|
||||
from ..util import enforce_types
|
||||
from ..config import setup_django, OUTPUT_DIR
|
||||
|
||||
|
@ -25,9 +25,19 @@ def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
|
|||
setup_django(out_dir, check_db=True)
|
||||
from core.models import Page
|
||||
|
||||
for link in links:
|
||||
all_urls = {link.url: link for link in links}
|
||||
|
||||
for page in Page.objects.all():
|
||||
if page.url in all_urls:
|
||||
info = {k: v for k, v in all_urls.pop(page.url)._asdict().items() if k in Page.keys}
|
||||
Page.objects.filter(url=page.url).update(**info)
|
||||
else:
|
||||
page.delete()
|
||||
|
||||
for url, link in all_urls.items():
|
||||
info = {k: v for k, v in link._asdict().items() if k in Page.keys}
|
||||
Page.objects.update_or_create(url=link.url, defaults=info)
|
||||
Page.objects.update_or_create(url=url, defaults=info)
|
||||
|
||||
|
||||
|
||||
@enforce_types
|
|
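The rewritten write_sql_main_index() above treats the Page table as a mirror of the main index keyed by URL: rows whose URL is still indexed get updated, rows that are no longer indexed get deleted, and whatever remains in all_urls afterwards gets created via update_or_create(), so the table never accumulates stale rows between runs. A hedged usage sketch (assumes an initialised data folder so setup_django() can find the database, and assumes load_main_index is the JSON-index loader referenced elsewhere in this diff):

    from archivebox.config import OUTPUT_DIR
    from archivebox.index import load_main_index
    from archivebox.index.sql import write_sql_main_index

    links = list(load_main_index(out_dir=OUTPUT_DIR))
    write_sql_main_index(links, out_dir=OUTPUT_DIR)   # syncs core.models.Page with the main index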
@ -1,58 +0,0 @@
|
|||
# This is the example default configuration file for ArchiveBox.
|
||||
#
|
||||
# Copy example config from here into your project's ArchiveBox.conf file,
|
||||
# DO NOT EDIT THIS FILE DIRECTLY!
|
||||
#
|
||||
# See the list of all the possible options, documentation, and examples here:
|
||||
# https://github.com/pirate/ArchiveBox/wiki/Configuration
|
||||
|
||||
[GENERAL_CONFIG]
|
||||
OUTPUT_PERMISSIONS = 755
|
||||
ONLY_NEW = False
|
||||
TIMEOUT = 60
|
||||
MEDIA_TIMEOUT = 3600
|
||||
ACTIVE_THEME = default
|
||||
FOOTER_INFO = Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.
|
||||
URL_BLACKLIST = (://(.*\.)?facebook\.com)|(://(.*\.)?ebay\.com)|(.*\.exe$)
|
||||
|
||||
[ARCHIVE_METHOD_TOGGLES]
|
||||
SAVE_TITLE = True
|
||||
SAVE_FAVICON = True
|
||||
SAVE_WGET = True
|
||||
SAVE_WGET_REQUISITES = True
|
||||
SAVE_WARC = True
|
||||
SAVE_PDF = True
|
||||
SAVE_SCREENSHOT = True
|
||||
SAVE_DOM = True
|
||||
SAVE_GIT = True
|
||||
SAVE_MEDIA = False
|
||||
SAVE_ARCHIVE_DOT_ORG = True
|
||||
|
||||
|
||||
[ARCHIVE_METHOD_OPTIONS]
|
||||
CHECK_SSL_VALIDITY = True
|
||||
RESOLUTION = 1440,900
|
||||
GIT_DOMAINS = github.com,bitbucket.org,gitlab.com
|
||||
|
||||
CHROME_HEADLESS = True
|
||||
CHROME_SANDBOX = True
|
||||
|
||||
COOKIES_FILE = path/to/cookies.txt
|
||||
CHROME_USER_DATA_DIR = ~/.config/google-chrome/Default
|
||||
|
||||
WGET_USER_AGENT = Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36
|
||||
CHROME_USER_AGENT = Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36
|
||||
|
||||
|
||||
[DEPENDENCY_CONFIG]
|
||||
USE_CURL = True
|
||||
USE_WGET = True
|
||||
USE_CHROME = True
|
||||
USE_YOUTUBEDL = True
|
||||
USE_GIT = True
|
||||
|
||||
CURL_BINARY = curl
|
||||
GIT_BINARY = git
|
||||
WGET_BINARY = wget
|
||||
YOUTUBEDL_BINARY = youtube-dl
|
||||
CHROME_BINARY = chromium
|
|
@ -1 +0,0 @@
|
|||
__package__ = 'archivebox.legacy'
|
|
@ -1,694 +0,0 @@
|
|||
import os
|
||||
|
||||
from typing import Dict, List, Tuple, Optional
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
|
||||
from .schema import Link, ArchiveResult, ArchiveOutput
|
||||
from .index import (
|
||||
load_link_details,
|
||||
write_link_details,
|
||||
patch_main_index,
|
||||
)
|
||||
from .config import (
|
||||
CURL_BINARY,
|
||||
GIT_BINARY,
|
||||
WGET_BINARY,
|
||||
YOUTUBEDL_BINARY,
|
||||
SAVE_FAVICON,
|
||||
SAVE_TITLE,
|
||||
SAVE_WGET,
|
||||
SAVE_WGET_REQUISITES,
|
||||
SAVE_PDF,
|
||||
SAVE_SCREENSHOT,
|
||||
SAVE_DOM,
|
||||
SAVE_WARC,
|
||||
SAVE_GIT,
|
||||
SAVE_MEDIA,
|
||||
SAVE_ARCHIVE_DOT_ORG,
|
||||
TIMEOUT,
|
||||
MEDIA_TIMEOUT,
|
||||
GIT_DOMAINS,
|
||||
VERSION,
|
||||
WGET_USER_AGENT,
|
||||
CHECK_SSL_VALIDITY,
|
||||
COOKIES_FILE,
|
||||
CURL_VERSION,
|
||||
WGET_VERSION,
|
||||
CHROME_VERSION,
|
||||
GIT_VERSION,
|
||||
YOUTUBEDL_VERSION,
|
||||
WGET_AUTO_COMPRESSION,
|
||||
)
|
||||
from .util import (
|
||||
enforce_types,
|
||||
domain,
|
||||
extension,
|
||||
without_query,
|
||||
without_fragment,
|
||||
fetch_page_title,
|
||||
is_static_file,
|
||||
TimedProgress,
|
||||
chmod_file,
|
||||
wget_output_path,
|
||||
chrome_args,
|
||||
run, PIPE, DEVNULL,
|
||||
)
|
||||
from .logs import (
|
||||
log_link_archiving_started,
|
||||
log_link_archiving_finished,
|
||||
log_archive_method_started,
|
||||
log_archive_method_finished,
|
||||
)
|
||||
|
||||
|
||||
class ArchiveError(Exception):
|
||||
def __init__(self, message, hints=None):
|
||||
super().__init__(message)
|
||||
self.hints = hints
|
||||
|
||||
|
||||
@enforce_types
def archive_link(link: Link, out_dir: Optional[str]=None) -> Link:
    """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""

    ARCHIVE_METHODS = (
        ('title', should_save_title, save_title),
        ('favicon', should_save_favicon, save_favicon),
        ('wget', should_save_wget, save_wget),
        ('pdf', should_save_pdf, save_pdf),
        ('screenshot', should_save_screenshot, save_screenshot),
        ('dom', should_save_dom, save_dom),
        ('git', should_save_git, save_git),
        ('media', should_save_media, save_media),
        ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
    )

    out_dir = out_dir or link.link_dir
    try:
        is_new = not os.path.exists(out_dir)
        if is_new:
            os.makedirs(out_dir)

        link = load_link_details(link, out_dir=out_dir)
        log_link_archiving_started(link, out_dir, is_new)
        link = link.overwrite(updated=datetime.now())
        stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}

        for method_name, should_run, method_function in ARCHIVE_METHODS:
            try:
                if method_name not in link.history:
                    link.history[method_name] = []

                if should_run(link, out_dir):
                    log_archive_method_started(method_name)

                    result = method_function(link=link, out_dir=out_dir)

                    link.history[method_name].append(result)

                    stats[result.status] += 1
                    log_archive_method_finished(result)
                else:
                    stats['skipped'] += 1
            except Exception as e:
                raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
                    method_name,
                    link.url,
                )) from e

        # print(' ', stats)

        write_link_details(link, out_dir=link.link_dir)
        patch_main_index(link)

        # # If any changes were made, update the main links index json and html
        # was_changed = stats['succeeded'] or stats['failed']
        # if was_changed:
        #     patch_main_index(link)

        log_link_archiving_finished(link, link.link_dir, is_new, stats)

    except KeyboardInterrupt:
        try:
            write_link_details(link, out_dir=link.link_dir)
        except:
            pass
        raise

    except Exception as err:
        print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
        raise

    return link
|
||||
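# Example: the dispatch pattern above in a minimal, standalone form. Each
# archive method is a (name, should_run, run) triple, results accumulate into a
# per-method history plus skipped/succeeded/failed counters. All names below
# are illustrative placeholders, not part of ArchiveBox's API.
from typing import Callable, Dict, List, NamedTuple, Tuple

class FakeResult(NamedTuple):
    method: str
    status: str  # 'succeeded' or 'failed'

Method = Tuple[str, Callable[[str], bool], Callable[[str], FakeResult]]

def run_methods(url: str, methods: Tuple[Method, ...]):
    history: Dict[str, List[FakeResult]] = {}
    stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
    for name, should_run, method in methods:
        history.setdefault(name, [])
        if not should_run(url):
            stats['skipped'] += 1
            continue
        result = method(url)
        history[name].append(result)
        stats[result.status] += 1
    return history, stats

demo_methods: Tuple[Method, ...] = (
    ('title', lambda url: True,  lambda url: FakeResult('title', 'succeeded')),
    ('pdf',   lambda url: False, lambda url: FakeResult('pdf', 'succeeded')),
)
print(run_methods('https://example.com', demo_methods))
# -> per-method history plus {'skipped': 1, 'succeeded': 1, 'failed': 0}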
|
||||
|
||||
### Archive Method Functions
|
||||
|
||||
@enforce_types
|
||||
def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool:
|
||||
# if link already has valid title, skip it
|
||||
if link.title and not link.title.lower().startswith('http'):
|
||||
return False
|
||||
|
||||
if is_static_file(link.url):
|
||||
return False
|
||||
|
||||
return SAVE_TITLE
|
||||
|
||||
@enforce_types
|
||||
def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
||||
"""try to guess the page's title from its content"""
|
||||
|
||||
output: ArchiveOutput = None
|
||||
cmd = [
|
||||
CURL_BINARY,
|
||||
link.url,
|
||||
'|',
|
||||
'grep',
|
||||
'<title',
|
||||
]
|
||||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
output = fetch_page_title(link.url, timeout=timeout, progress=False)
|
||||
if not output:
|
||||
raise ArchiveError('Unable to detect page title')
|
||||
except Exception as err:
|
||||
status = 'failed'
|
||||
output = err
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
return ArchiveResult(
|
||||
cmd=cmd,
|
||||
pwd=out_dir,
|
||||
cmd_version=CURL_VERSION,
|
||||
output=output,
|
||||
status=status,
|
||||
**timer.stats,
|
||||
)
|
||||
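# Example: fetch_page_title() lives in util.py and isn't shown in this diff; a
# rough stdlib-only equivalent of "download the page and pull out <title>"
# might look like this (a sketch, not ArchiveBox's actual implementation):
import re
from urllib.request import urlopen

def guess_page_title(url: str, timeout: int = 60) -> str:
    html = urlopen(url, timeout=timeout).read().decode('utf-8', errors='replace')
    match = re.search(r'<title[^>]*>(.*?)</title>', html, re.IGNORECASE | re.DOTALL)
    if not match:
        raise ValueError('Unable to detect page title')
    return match.group(1).strip()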
|
||||
|
||||
@enforce_types
|
||||
def should_save_favicon(link: Link, out_dir: Optional[str]=None) -> bool:
|
||||
out_dir = out_dir or link.link_dir
|
||||
if os.path.exists(os.path.join(out_dir, 'favicon.ico')):
|
||||
return False
|
||||
|
||||
return SAVE_FAVICON
|
||||
|
||||
@enforce_types
|
||||
def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
||||
"""download site favicon from google's favicon api"""
|
||||
|
||||
out_dir = out_dir or link.link_dir
|
||||
output: ArchiveOutput = 'favicon.ico'
|
||||
cmd = [
|
||||
CURL_BINARY,
|
||||
'--max-time', str(timeout),
|
||||
'--location',
|
||||
'--output', str(output),
|
||||
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
||||
'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
|
||||
]
|
||||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
|
||||
chmod_file(output, cwd=out_dir)
|
||||
except Exception as err:
|
||||
status = 'failed'
|
||||
output = err
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
return ArchiveResult(
|
||||
cmd=cmd,
|
||||
pwd=out_dir,
|
||||
cmd_version=CURL_VERSION,
|
||||
output=output,
|
||||
status=status,
|
||||
**timer.stats,
|
||||
)
|
||||
|
||||
@enforce_types
|
||||
def should_save_wget(link: Link, out_dir: Optional[str]=None) -> bool:
|
||||
output_path = wget_output_path(link)
|
||||
out_dir = out_dir or link.link_dir
|
||||
if output_path and os.path.exists(os.path.join(out_dir, output_path)):
|
||||
return False
|
||||
|
||||
return SAVE_WGET
|
||||
|
||||
|
||||
@enforce_types
|
||||
def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
||||
"""download full site using wget"""
|
||||
|
||||
out_dir = out_dir or link.link_dir
|
||||
if SAVE_WARC:
|
||||
warc_dir = os.path.join(out_dir, 'warc')
|
||||
os.makedirs(warc_dir, exist_ok=True)
|
||||
warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
|
||||
|
||||
# WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
|
||||
output: ArchiveOutput = None
|
||||
cmd = [
|
||||
WGET_BINARY,
|
||||
# '--server-response', # print headers for better error parsing
|
||||
'--no-verbose',
|
||||
'--adjust-extension',
|
||||
'--convert-links',
|
||||
'--force-directories',
|
||||
'--backup-converted',
|
||||
'--span-hosts',
|
||||
'--no-parent',
|
||||
'-e', 'robots=off',
|
||||
'--restrict-file-names=windows',
|
||||
'--timeout={}'.format(timeout),
|
||||
*([] if SAVE_WARC else ['--timestamping']),
|
||||
*(['--warc-file={}'.format(warc_path)] if SAVE_WARC else []),
|
||||
*(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
|
||||
*(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
|
||||
*(['--load-cookies', COOKIES_FILE] if COOKIES_FILE else []),
|
||||
*(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
|
||||
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
|
||||
link.url,
|
||||
]
|
||||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
|
||||
output = wget_output_path(link)
|
||||
|
||||
# parse out number of files downloaded from last line of stderr:
|
||||
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
|
||||
output_tail = [
|
||||
line.strip()
|
||||
for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
|
||||
if line.strip()
|
||||
]
|
||||
files_downloaded = (
|
||||
int(output_tail[-1].strip().split(' ', 2)[1] or 0)
|
||||
if 'Downloaded:' in output_tail[-1]
|
||||
else 0
|
||||
)
|
||||
|
||||
# Check for common failure cases
|
||||
if result.returncode > 0 and files_downloaded < 1:
|
||||
hints = (
|
||||
'Got wget response code: {}.'.format(result.returncode),
|
||||
*output_tail,
|
||||
)
|
||||
if b'403: Forbidden' in result.stderr:
|
||||
raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
|
||||
if b'404: Not Found' in result.stderr:
|
||||
raise ArchiveError('404 Not Found', hints)
|
||||
if b'ERROR 500: Internal Server Error' in result.stderr:
|
||||
raise ArchiveError('500 Internal Server Error', hints)
|
||||
raise ArchiveError('Got an error from the server', hints)
|
||||
except Exception as err:
|
||||
status = 'failed'
|
||||
output = err
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
return ArchiveResult(
|
||||
cmd=cmd,
|
||||
pwd=out_dir,
|
||||
cmd_version=WGET_VERSION,
|
||||
output=output,
|
||||
status=status,
|
||||
**timer.stats,
|
||||
)
|
||||
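# Example: the stderr parsing above, isolated. Grab the last few non-empty
# lines and read the file count out of wget's "Downloaded: N files, ..." line.
def parse_files_downloaded(output_text: str) -> int:
    tail = [line.strip() for line in output_text.rsplit('\n', 3)[-3:] if line.strip()]
    if tail and 'Downloaded:' in tail[-1]:
        return int(tail[-1].split(' ', 2)[1])
    return 0

sample = 'FINISHED --2019-03-20--\nTotal wall clock time: 1.6s\nDownloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)\n'
assert parse_files_downloaded(sample) == 76
assert parse_files_downloaded('wget: unable to resolve host address\n') == 0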
|
||||
@enforce_types
|
||||
def should_save_pdf(link: Link, out_dir: Optional[str]=None) -> bool:
|
||||
out_dir = out_dir or link.link_dir
|
||||
if is_static_file(link.url):
|
||||
return False
|
||||
|
||||
if os.path.exists(os.path.join(out_dir, 'output.pdf')):
|
||||
return False
|
||||
|
||||
return SAVE_PDF
|
||||
|
||||
|
||||
@enforce_types
|
||||
def save_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
||||
"""print PDF of site to file using chrome --headless"""
|
||||
|
||||
out_dir = out_dir or link.link_dir
|
||||
output: ArchiveOutput = 'output.pdf'
|
||||
cmd = [
|
||||
*chrome_args(TIMEOUT=timeout),
|
||||
'--print-to-pdf',
|
||||
link.url,
|
||||
]
|
||||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
|
||||
|
||||
if result.returncode:
|
||||
hints = (result.stderr or result.stdout).decode()
|
||||
raise ArchiveError('Failed to save PDF', hints)
|
||||
|
||||
chmod_file('output.pdf', cwd=out_dir)
|
||||
except Exception as err:
|
||||
status = 'failed'
|
||||
output = err
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
return ArchiveResult(
|
||||
cmd=cmd,
|
||||
pwd=out_dir,
|
||||
cmd_version=CHROME_VERSION,
|
||||
output=output,
|
||||
status=status,
|
||||
**timer.stats,
|
||||
)
|
||||
|
||||
@enforce_types
|
||||
def should_save_screenshot(link: Link, out_dir: Optional[str]=None) -> bool:
|
||||
out_dir = out_dir or link.link_dir
|
||||
if is_static_file(link.url):
|
||||
return False
|
||||
|
||||
if os.path.exists(os.path.join(out_dir, 'screenshot.png')):
|
||||
return False
|
||||
|
||||
return SAVE_SCREENSHOT
|
||||
|
||||
@enforce_types
|
||||
def save_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
||||
"""take screenshot of site using chrome --headless"""
|
||||
|
||||
out_dir = out_dir or link.link_dir
|
||||
output: ArchiveOutput = 'screenshot.png'
|
||||
cmd = [
|
||||
*chrome_args(TIMEOUT=timeout),
|
||||
'--screenshot',
|
||||
link.url,
|
||||
]
|
||||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
|
||||
|
||||
if result.returncode:
|
||||
hints = (result.stderr or result.stdout).decode()
|
||||
raise ArchiveError('Failed to save screenshot', hints)
|
||||
|
||||
chmod_file(output, cwd=out_dir)
|
||||
except Exception as err:
|
||||
status = 'failed'
|
||||
output = err
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
return ArchiveResult(
|
||||
cmd=cmd,
|
||||
pwd=out_dir,
|
||||
cmd_version=CHROME_VERSION,
|
||||
output=output,
|
||||
status=status,
|
||||
**timer.stats,
|
||||
)
|
||||
|
||||
@enforce_types
|
||||
def should_save_dom(link: Link, out_dir: Optional[str]=None) -> bool:
|
||||
out_dir = out_dir or link.link_dir
|
||||
if is_static_file(link.url):
|
||||
return False
|
||||
|
||||
if os.path.exists(os.path.join(out_dir, 'output.html')):
|
||||
return False
|
||||
|
||||
return SAVE_DOM
|
||||
|
||||
@enforce_types
|
||||
def save_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
||||
"""print HTML of site to file using chrome --dump-html"""
|
||||
|
||||
out_dir = out_dir or link.link_dir
|
||||
output: ArchiveOutput = 'output.html'
|
||||
output_path = os.path.join(out_dir, str(output))
|
||||
cmd = [
|
||||
*chrome_args(TIMEOUT=timeout),
|
||||
'--dump-dom',
|
||||
link.url
|
||||
]
|
||||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
with open(output_path, 'w+') as f:
|
||||
result = run(cmd, stdout=f, stderr=PIPE, cwd=out_dir, timeout=timeout)
|
||||
|
||||
if result.returncode:
|
||||
hints = result.stderr.decode()
|
||||
raise ArchiveError('Failed to save DOM', hints)
|
||||
|
||||
chmod_file(output, cwd=out_dir)
|
||||
except Exception as err:
|
||||
status = 'failed'
|
||||
output = err
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
return ArchiveResult(
|
||||
cmd=cmd,
|
||||
pwd=out_dir,
|
||||
cmd_version=CHROME_VERSION,
|
||||
output=output,
|
||||
status=status,
|
||||
**timer.stats,
|
||||
)
|
||||
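# Example: save_dom captures the page by pointing the child process's stdout at
# the output file instead of buffering it in memory. Same pattern with a
# harmless placeholder command (POSIX `echo` stands in for chrome here):
from subprocess import PIPE, run

with open('output.html', 'w+') as f:
    result = run(['echo', '<html>placeholder</html>'], stdout=f, stderr=PIPE)
if result.returncode:
    raise RuntimeError(result.stderr.decode())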
|
||||
@enforce_types
|
||||
def should_save_git(link: Link, out_dir: Optional[str]=None) -> bool:
|
||||
out_dir = out_dir or link.link_dir
|
||||
if is_static_file(link.url):
|
||||
return False
|
||||
|
||||
if os.path.exists(os.path.join(out_dir, 'git')):
|
||||
return False
|
||||
|
||||
is_clonable_url = (
|
||||
(domain(link.url) in GIT_DOMAINS)
|
||||
or (extension(link.url) == 'git')
|
||||
)
|
||||
if not is_clonable_url:
|
||||
return False
|
||||
|
||||
return SAVE_GIT
|
||||
|
||||
|
||||
@enforce_types
|
||||
def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
||||
"""download full site using git"""
|
||||
|
||||
out_dir = out_dir or link.link_dir
|
||||
output: ArchiveOutput = 'git'
|
||||
output_path = os.path.join(out_dir, str(output))
|
||||
os.makedirs(output_path, exist_ok=True)
|
||||
cmd = [
|
||||
GIT_BINARY,
|
||||
'clone',
|
||||
'--mirror',
|
||||
'--recursive',
|
||||
*([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
|
||||
without_query(without_fragment(link.url)),
|
||||
]
|
||||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
|
||||
|
||||
if result.returncode == 128:
|
||||
# ignore failed re-download when the folder already exists
|
||||
pass
|
||||
elif result.returncode > 0:
|
||||
hints = 'Got git response code: {}.'.format(result.returncode)
|
||||
raise ArchiveError('Failed to save git clone', hints)
|
||||
|
||||
except Exception as err:
|
||||
status = 'failed'
|
||||
output = err
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
return ArchiveResult(
|
||||
cmd=cmd,
|
||||
pwd=out_dir,
|
||||
cmd_version=GIT_VERSION,
|
||||
output=output,
|
||||
status=status,
|
||||
**timer.stats,
|
||||
)
|
||||
|
||||
|
||||
@enforce_types
|
||||
def should_save_media(link: Link, out_dir: Optional[str]=None) -> bool:
|
||||
out_dir = out_dir or link.link_dir
|
||||
|
||||
if is_static_file(link.url):
|
||||
return False
|
||||
|
||||
if os.path.exists(os.path.join(out_dir, 'media')):
|
||||
return False
|
||||
|
||||
return SAVE_MEDIA
|
||||
|
||||
@enforce_types
|
||||
def save_media(link: Link, out_dir: Optional[str]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
|
||||
"""Download playlists or individual video, audio, and subtitles using youtube-dl"""
|
||||
|
||||
out_dir = out_dir or link.link_dir
|
||||
output: ArchiveOutput = 'media'
|
||||
output_path = os.path.join(out_dir, str(output))
|
||||
os.makedirs(output_path, exist_ok=True)
|
||||
cmd = [
|
||||
YOUTUBEDL_BINARY,
|
||||
'--write-description',
|
||||
'--write-info-json',
|
||||
'--write-annotations',
|
||||
'--yes-playlist',
|
||||
'--write-thumbnail',
|
||||
'--no-call-home',
|
||||
'--no-check-certificate',
|
||||
|
||||
'--all-subs',
|
||||
'--extract-audio',
|
||||
'--keep-video',
|
||||
'--ignore-errors',
|
||||
'--geo-bypass',
|
||||
'--audio-format', 'mp3',
|
||||
'--audio-quality', '320K',
|
||||
'--embed-thumbnail',
|
||||
'--add-metadata',
|
||||
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
|
||||
link.url,
|
||||
]
|
||||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
|
||||
chmod_file(output, cwd=out_dir)
|
||||
if result.returncode:
|
||||
if (b'ERROR: Unsupported URL' in result.stderr
|
||||
or b'HTTP Error 404' in result.stderr
|
||||
or b'HTTP Error 403' in result.stderr
|
||||
or b'URL could be a direct video link' in result.stderr
|
||||
or b'Unable to extract container ID' in result.stderr):
|
||||
# These happen too frequently on non-media pages to warrant printing to console
|
||||
pass
|
||||
else:
|
||||
hints = (
|
||||
'Got youtube-dl response code: {}.'.format(result.returncode),
|
||||
*result.stderr.decode().split('\n'),
|
||||
)
|
||||
raise ArchiveError('Failed to save media', hints)
|
||||
except Exception as err:
|
||||
status = 'failed'
|
||||
output = err
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
return ArchiveResult(
|
||||
cmd=cmd,
|
||||
pwd=out_dir,
|
||||
cmd_version=YOUTUBEDL_VERSION,
|
||||
output=output,
|
||||
status=status,
|
||||
**timer.stats,
|
||||
)
|
||||
|
||||
|
||||
@enforce_types
|
||||
def should_save_archive_dot_org(link: Link, out_dir: Optional[str]=None) -> bool:
|
||||
out_dir = out_dir or link.link_dir
|
||||
if is_static_file(link.url):
|
||||
return False
|
||||
|
||||
if os.path.exists(os.path.join(out_dir, 'archive.org.txt')):
|
||||
# if open(path, 'r').read().strip() != 'None':
|
||||
return False
|
||||
|
||||
return SAVE_ARCHIVE_DOT_ORG
|
||||
|
||||
@enforce_types
|
||||
def save_archive_dot_org(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
||||
"""submit site to archive.org for archiving via their service, save returned archive url"""
|
||||
|
||||
out_dir = out_dir or link.link_dir
|
||||
output: ArchiveOutput = 'archive.org.txt'
|
||||
archive_org_url = None
|
||||
submit_url = 'https://web.archive.org/save/{}'.format(link.url)
|
||||
cmd = [
|
||||
CURL_BINARY,
|
||||
'--location',
|
||||
'--head',
|
||||
'--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
|
||||
'--max-time', str(timeout),
|
||||
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
||||
submit_url,
|
||||
]
|
||||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout)
|
||||
content_location, errors = parse_archive_dot_org_response(result.stdout)
|
||||
if content_location:
|
||||
archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
|
||||
elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
|
||||
archive_org_url = None
|
||||
# raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))
|
||||
elif errors:
|
||||
raise ArchiveError(', '.join(errors))
|
||||
else:
|
||||
raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
|
||||
except Exception as err:
|
||||
status = 'failed'
|
||||
output = err
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
if output and not isinstance(output, Exception):
|
||||
# instead of writing None when archive.org rejects the url write the
|
||||
# url to resubmit it to archive.org. This is so when the user visits
|
||||
# the URL in person, it will attempt to re-archive it, and it'll show the
|
||||
# nicer error message explaining why the url was rejected if it fails.
|
||||
archive_org_url = archive_org_url or submit_url
|
||||
with open(os.path.join(out_dir, str(output)), 'w', encoding='utf-8') as f:
|
||||
f.write(archive_org_url)
|
||||
chmod_file('archive.org.txt', cwd=out_dir)
|
||||
output = archive_org_url
|
||||
|
||||
return ArchiveResult(
|
||||
cmd=cmd,
|
||||
pwd=out_dir,
|
||||
cmd_version=CURL_VERSION,
|
||||
output=output,
|
||||
status=status,
|
||||
**timer.stats,
|
||||
)
|
||||
|
||||
@enforce_types
def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
    # Parse archive.org response headers
    headers: Dict[str, List[str]] = defaultdict(list)

    # lowercase all the header names and store in dict
    for header in response.splitlines():
        if b':' not in header or not header.strip():
            continue
        name, val = header.decode().split(':', 1)
        headers[name.lower().strip()].append(val.strip())

    # Get successful archive url in "content-location" header or any errors
    content_location = headers['content-location']
    errors = headers['x-archive-wayback-runtime-error']
    return content_location, errors
|
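# Example: what the parser above extracts from a (fabricated) HEAD response.
# Header names are lowercased, and the Content-Location value becomes the
# snapshot path appended to https://web.archive.org.
from collections import defaultdict

sample_response = (
    b'HTTP/1.1 302 FOUND\r\n'
    b'Content-Location: /web/20190321000000/https://example.com/\r\n'
    b'Server: nginx\r\n'
)

headers = defaultdict(list)
for header in sample_response.splitlines():
    if b':' not in header or not header.strip():
        continue
    name, val = header.decode().split(':', 1)
    headers[name.lower().strip()].append(val.strip())

print(headers['content-location'][0])
# -> /web/20190321000000/https://example.com/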
|
@ -1,626 +0,0 @@
|
|||
import os
|
||||
import re
|
||||
import shutil
|
||||
|
||||
from typing import Dict, List, Optional, Iterable
|
||||
from itertools import chain
|
||||
|
||||
from .schema import Link
|
||||
from .util import (
|
||||
enforce_types,
|
||||
TimedProgress,
|
||||
get_dir_size,
|
||||
human_readable_size,
|
||||
)
|
||||
from .index import (
|
||||
links_after_timestamp,
|
||||
load_main_index,
|
||||
import_new_links,
|
||||
write_main_index,
|
||||
)
|
||||
from .storage.json import (
|
||||
parse_json_main_index,
|
||||
parse_json_link_details,
|
||||
parse_json_links_details,
|
||||
)
|
||||
from .storage.sql import parse_sql_main_index, get_admins
|
||||
from .storage.html import parse_html_main_index
|
||||
from .archive_methods import archive_link
|
||||
from .config import (
|
||||
stderr,
|
||||
ANSI,
|
||||
ONLY_NEW,
|
||||
OUTPUT_DIR,
|
||||
SOURCES_DIR,
|
||||
ARCHIVE_DIR,
|
||||
LOGS_DIR,
|
||||
CONFIG_FILE,
|
||||
ARCHIVE_DIR_NAME,
|
||||
SOURCES_DIR_NAME,
|
||||
LOGS_DIR_NAME,
|
||||
STATIC_DIR_NAME,
|
||||
JSON_INDEX_FILENAME,
|
||||
HTML_INDEX_FILENAME,
|
||||
SQL_INDEX_FILENAME,
|
||||
ROBOTS_TXT_FILENAME,
|
||||
FAVICON_FILENAME,
|
||||
check_dependencies,
|
||||
check_data_folder,
|
||||
setup_django,
|
||||
write_config_file,
|
||||
)
|
||||
from .logs import (
|
||||
log_archiving_started,
|
||||
log_archiving_paused,
|
||||
log_archiving_finished,
|
||||
log_removal_started,
|
||||
log_removal_finished,
|
||||
log_list_started,
|
||||
log_list_finished,
|
||||
)
|
||||
|
||||
|
||||
ALLOWED_IN_OUTPUT_DIR = {
|
||||
'.DS_Store',
|
||||
'.venv',
|
||||
'venv',
|
||||
'virtualenv',
|
||||
'.virtualenv',
|
||||
ARCHIVE_DIR_NAME,
|
||||
SOURCES_DIR_NAME,
|
||||
LOGS_DIR_NAME,
|
||||
STATIC_DIR_NAME,
|
||||
SQL_INDEX_FILENAME,
|
||||
JSON_INDEX_FILENAME,
|
||||
HTML_INDEX_FILENAME,
|
||||
ROBOTS_TXT_FILENAME,
|
||||
FAVICON_FILENAME,
|
||||
}
|
||||
|
||||
|
||||
@enforce_types
|
||||
def init():
|
||||
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
||||
|
||||
is_empty = not len(set(os.listdir(OUTPUT_DIR)) - ALLOWED_IN_OUTPUT_DIR)
|
||||
existing_index = os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
|
||||
|
||||
if is_empty and not existing_index:
|
||||
print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
|
||||
print(f' {OUTPUT_DIR}')
|
||||
print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
|
||||
elif existing_index:
|
||||
print('{green}[*] Updating existing ArchiveBox collection in this folder...{reset}'.format(**ANSI))
|
||||
print(f' {OUTPUT_DIR}')
|
||||
print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
|
||||
else:
|
||||
stderr(
|
||||
("{red}[X] This folder appears to already have files in it, but no index.json is present.{reset}\n\n"
|
||||
" You must run init in a completely empty directory, or an existing data folder.\n\n"
|
||||
" {lightred}Hint:{reset} To import an existing data folder make sure to cd into the folder first, \n"
|
||||
" then run and run 'archivebox init' to pick up where you left off.\n\n"
|
||||
" (Always make sure your data folder is backed up first before updating ArchiveBox)"
|
||||
).format(OUTPUT_DIR, **ANSI)
|
||||
)
|
||||
raise SystemExit(1)
|
||||
|
||||
if existing_index:
|
||||
print('\n{green}[*] Verifying archive folder structure...{reset}'.format(**ANSI))
|
||||
else:
|
||||
print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))
|
||||
|
||||
os.makedirs(SOURCES_DIR, exist_ok=True)
|
||||
print(f' √ {SOURCES_DIR}')
|
||||
|
||||
os.makedirs(ARCHIVE_DIR, exist_ok=True)
|
||||
print(f' √ {ARCHIVE_DIR}')
|
||||
|
||||
os.makedirs(LOGS_DIR, exist_ok=True)
|
||||
print(f' √ {LOGS_DIR}')
|
||||
|
||||
write_config_file({}, out_dir=OUTPUT_DIR)
|
||||
print(f' √ {CONFIG_FILE}')
|
||||
|
||||
if os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME)):
|
||||
print('\n{green}[*] Verifying main SQL index and running migrations...{reset}'.format(**ANSI))
|
||||
else:
|
||||
print('\n{green}[+] Building main SQL index and running migrations...{reset}'.format(**ANSI))
|
||||
|
||||
setup_django(OUTPUT_DIR, check_db=False)
|
||||
from django.conf import settings
|
||||
assert settings.DATABASE_FILE == os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME)
|
||||
print(f' √ {settings.DATABASE_FILE}')
|
||||
print()
|
||||
from .storage.sql import apply_migrations
|
||||
for migration_line in apply_migrations(OUTPUT_DIR):
|
||||
print(f' {migration_line}')
|
||||
|
||||
|
||||
assert os.path.exists(settings.DATABASE_FILE)
|
||||
|
||||
# from django.contrib.auth.models import User
|
||||
# if IS_TTY and not User.objects.filter(is_superuser=True).exists():
|
||||
# print('{green}[+] Creating admin user account...{reset}'.format(**ANSI))
|
||||
# call_command("createsuperuser", interactive=True)
|
||||
|
||||
print()
|
||||
print('{green}[*] Collecting links from any existing index or archive folders...{reset}'.format(**ANSI))
|
||||
|
||||
all_links = {}
|
||||
if existing_index:
|
||||
all_links = {
|
||||
link.url: link
|
||||
for link in load_main_index(out_dir=OUTPUT_DIR, warn=False)
|
||||
}
|
||||
print(' √ Loaded {} links from existing main index...'.format(len(all_links)))
|
||||
|
||||
orphaned_json_links = {
|
||||
link.url: link
|
||||
for link in parse_json_main_index(OUTPUT_DIR)
|
||||
if link.url not in all_links
|
||||
}
|
||||
if orphaned_json_links:
|
||||
all_links.update(orphaned_json_links)
|
||||
print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
|
||||
|
||||
orphaned_sql_links = {
|
||||
link.url: link
|
||||
for link in parse_sql_main_index(OUTPUT_DIR)
|
||||
if link.url not in all_links
|
||||
}
|
||||
if orphaned_sql_links:
|
||||
all_links.update(orphaned_sql_links)
|
||||
print(' {lightyellow}√ Added {} orphaned links from existing SQL index...{reset}'.format(len(orphaned_sql_links), **ANSI))
|
||||
|
||||
orphaned_data_dir_links = {
|
||||
link.url: link
|
||||
for link in parse_json_links_details(OUTPUT_DIR)
|
||||
}
|
||||
orphan_new_links = {
|
||||
url: link
|
||||
for url, link in orphaned_data_dir_links.items()
|
||||
if url not in all_links
|
||||
}
|
||||
orphan_duplicates = {
|
||||
url: link
|
||||
for url, link in orphaned_data_dir_links.items()
|
||||
if url in all_links
|
||||
}
|
||||
if orphan_new_links:
|
||||
all_links.update(orphan_new_links)
|
||||
print(' {lightyellow}√ Added {} orphaned links from existing archive directories...{reset}'.format(len(orphan_new_links), **ANSI))
|
||||
if orphan_duplicates:
|
||||
print(' {lightyellow}! Skipped adding {} invalid link data directories that would have overwritten or corrupted existing data.{reset}'.format(len(orphan_duplicates), **ANSI))
|
||||
|
||||
orphaned_data_dirs = {folder for folder in orphan_duplicates.keys()}
|
||||
invalid_folders = {
|
||||
folder: link
|
||||
for folder, link in get_invalid_folders(all_links.values(), out_dir=OUTPUT_DIR).items()
|
||||
if folder not in orphaned_data_dirs
|
||||
}
|
||||
if invalid_folders:
|
||||
print(' {lightyellow}! Skipped adding {} corrupted/unrecognized link data directories that could not be read.{reset}'.format(len(invalid_folders), **ANSI))
|
||||
|
||||
if orphan_duplicates or invalid_folders:
|
||||
print(' For more information about the link data directories that were skipped, run:')
|
||||
print(' archivebox info')
|
||||
print(' archivebox list --status=invalid')
|
||||
print(' archivebox list --status=orphaned')
|
||||
print(' archivebox list --status=duplicate')
|
||||
|
||||
|
||||
write_main_index(list(all_links.values()), out_dir=OUTPUT_DIR)
|
||||
|
||||
print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
|
||||
if existing_index:
|
||||
print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
|
||||
else:
|
||||
print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI))
|
||||
print()
|
||||
print(' To view your archive index, open:')
|
||||
print(' {}'.format(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME)))
|
||||
print()
|
||||
print(' To add new links, you can run:')
|
||||
print(" archivebox add 'https://example.com'")
|
||||
print()
|
||||
print(' For more usage and examples, run:')
|
||||
print(' archivebox help')
|
||||
|
||||
|
||||
@enforce_types
|
||||
def info():
|
||||
|
||||
print('{green}[*] Scanning archive collection main index...{reset}'.format(**ANSI))
|
||||
print(f' {OUTPUT_DIR}/*')
|
||||
num_bytes, num_dirs, num_files = get_dir_size(OUTPUT_DIR, recursive=False, pattern='index.')
|
||||
size = human_readable_size(num_bytes)
|
||||
print(f' Size: {size} across {num_files} files')
|
||||
print()
|
||||
|
||||
links = list(load_main_index(out_dir=OUTPUT_DIR))
|
||||
num_json_links = len(links)
|
||||
num_sql_links = sum(1 for link in parse_sql_main_index(out_dir=OUTPUT_DIR))
|
||||
num_html_links = sum(1 for url in parse_html_main_index(out_dir=OUTPUT_DIR))
|
||||
num_link_details = sum(1 for link in parse_json_links_details(out_dir=OUTPUT_DIR))
|
||||
users = get_admins().values_list('username', flat=True)
|
||||
print(f' > JSON Main Index: {num_json_links} links'.ljust(36), f'(found in {JSON_INDEX_FILENAME})')
|
||||
print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
|
||||
print(f' > HTML Main Index: {num_html_links} links'.ljust(36), f'(found in {HTML_INDEX_FILENAME})')
|
||||
print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')
|
||||
|
||||
print(f' > Admin: {len(users)} users {", ".join(users)}'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
|
||||
|
||||
if num_html_links != len(links) or num_sql_links != len(links):
|
||||
print()
|
||||
print(' {lightred}Hint:{reset} You can fix index count differences automatically by running:'.format(**ANSI))
|
||||
print(' archivebox init')
|
||||
|
||||
if not users:
|
||||
print()
|
||||
print(' {lightred}Hint:{reset} You can create an admin user by running:'.format(**ANSI))
|
||||
print(' archivebox manage createsuperuser')
|
||||
|
||||
print()
|
||||
print('{green}[*] Scanning archive collection link data directories...{reset}'.format(**ANSI))
|
||||
print(f' {ARCHIVE_DIR}/*')
|
||||
|
||||
num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
|
||||
size = human_readable_size(num_bytes)
|
||||
print(f' Size: {size} across {num_files} files in {num_dirs} directories')
|
||||
print()
|
||||
|
||||
num_indexed = len(get_indexed_folders(links, out_dir=OUTPUT_DIR))
|
||||
num_archived = len(get_archived_folders(links, out_dir=OUTPUT_DIR))
|
||||
num_unarchived = len(get_unarchived_folders(links, out_dir=OUTPUT_DIR))
|
||||
print(f' > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
|
||||
print(f' > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
|
||||
print(f' > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')
|
||||
|
||||
num_present = len(get_present_folders(links, out_dir=OUTPUT_DIR))
|
||||
num_valid = len(get_valid_folders(links, out_dir=OUTPUT_DIR))
|
||||
print()
|
||||
print(f' > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
|
||||
print(f' > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})')
|
||||
|
||||
duplicate = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
|
||||
orphaned = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
|
||||
corrupted = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
|
||||
unrecognized = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
|
||||
num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
|
||||
print(f' > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})')
|
||||
print(f' > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
|
||||
print(f' > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
|
||||
print(f' > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
|
||||
print(f' > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')
|
||||
|
||||
if num_indexed:
|
||||
print()
|
||||
print(' {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**ANSI))
|
||||
print(' archivebox list --status=<status> (e.g. indexed, corrupted, archived, etc.)')
|
||||
|
||||
if orphaned:
|
||||
print()
|
||||
print(' {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**ANSI))
|
||||
print(' archivebox init')
|
||||
|
||||
if num_invalid:
|
||||
print()
|
||||
print(' {lightred}Hint:{reset} You may need to manually remove or fix some invalid data directories, afterwards make sure to run:'.format(**ANSI))
|
||||
print(' archivebox init')
|
||||
|
||||
print()
|
||||
|
||||
|
||||
|
||||
@enforce_types
|
||||
def update_archive_data(import_path: Optional[str]=None,
|
||||
resume: Optional[float]=None,
|
||||
only_new: bool=False,
|
||||
index_only: bool=False) -> List[Link]:
|
||||
"""The main ArchiveBox entrancepoint. Everything starts here."""
|
||||
|
||||
check_dependencies()
|
||||
check_data_folder()
|
||||
|
||||
# Step 1: Load list of links from the existing index
|
||||
# merge in and dedupe new links from import_path
|
||||
all_links: List[Link] = []
|
||||
new_links: List[Link] = []
|
||||
all_links = load_main_index(out_dir=OUTPUT_DIR)
|
||||
if import_path:
|
||||
all_links, new_links = import_new_links(all_links, import_path)
|
||||
|
||||
# Step 2: Write updated index with deduped old and new links back to disk
|
||||
write_main_index(links=list(all_links), out_dir=OUTPUT_DIR)
|
||||
|
||||
if index_only:
|
||||
return all_links
|
||||
|
||||
# Step 3: Run the archive methods for each link
|
||||
links = new_links if ONLY_NEW else all_links
|
||||
log_archiving_started(len(links), resume)
|
||||
idx: int = 0
|
||||
link: Link = None # type: ignore
|
||||
try:
|
||||
for idx, link in enumerate(links_after_timestamp(links, resume)):
|
||||
archive_link(link, out_dir=link.link_dir)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
log_archiving_paused(len(links), idx, link.timestamp if link else '0')
|
||||
raise SystemExit(0)
|
||||
|
||||
except:
|
||||
print()
|
||||
raise
|
||||
|
||||
log_archiving_finished(len(links))
|
||||
|
||||
# Step 4: Re-write links index with updated titles, icons, and resources
|
||||
all_links = load_main_index(out_dir=OUTPUT_DIR)
|
||||
write_main_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
|
||||
return all_links
|
||||
|
||||
|
||||
LINK_FILTERS = {
|
||||
'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
|
||||
'substring': lambda link, pattern: pattern in link.url,
|
||||
'regex': lambda link, pattern: bool(re.match(pattern, link.url)),
|
||||
'domain': lambda link, pattern: link.domain == pattern,
|
||||
}
|
||||
|
||||
@enforce_types
|
||||
def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str='exact') -> bool:
|
||||
for pattern in filter_patterns:
|
||||
if LINK_FILTERS[filter_type](link, pattern):
|
||||
return True
|
||||
|
||||
return False
|
||||
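# Example: the filter predicates above, exercised on a bare URL string (a
# stand-in for Link.url / Link.domain, since Link isn't needed to show the idea):
import re

FILTERS = {
    'exact':     lambda url, pat: url == pat,
    'substring': lambda url, pat: pat in url,
    'regex':     lambda url, pat: bool(re.match(pat, url)),
    'domain':    lambda url, pat: url.split('/')[2] == pat,
}

url = 'https://example.com/page?id=3'
assert FILTERS['substring'](url, 'example')
assert FILTERS['regex'](url, r'https://example\.')
assert FILTERS['domain'](url, 'example.com')
assert not FILTERS['exact'](url, 'https://example.com/')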
|
||||
|
||||
@enforce_types
|
||||
def list_archive_data(filter_patterns: Optional[List[str]]=None, filter_type: str='exact',
|
||||
after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
|
||||
|
||||
all_links = load_main_index(out_dir=OUTPUT_DIR)
|
||||
|
||||
for link in all_links:
|
||||
if after is not None and float(link.timestamp) < after:
|
||||
continue
|
||||
if before is not None and float(link.timestamp) > before:
|
||||
continue
|
||||
|
||||
if filter_patterns:
|
||||
if link_matches_filter(link, filter_patterns, filter_type):
|
||||
yield link
|
||||
else:
|
||||
yield link
|
||||
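# Example: the before/after windowing above over plain (timestamp, url) pairs.
# A link is yielded only if float(timestamp) falls inside the open-ended window.
def in_window(ts: str, after=None, before=None) -> bool:
    t = float(ts)
    return (after is None or t >= after) and (before is None or t <= before)

links = [('1544212312.42', 'https://example.com'), ('1611111111.0', 'https://archive.org')]
print([url for ts, url in links if in_window(ts, after=1.6e9)])
# -> ['https://archive.org']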
|
||||
|
||||
@enforce_types
|
||||
def remove_archive_links(filter_patterns: List[str], filter_type: str='exact',
|
||||
after: Optional[float]=None, before: Optional[float]=None,
|
||||
yes: bool=False, delete: bool=False) -> List[Link]:
|
||||
|
||||
check_dependencies()
|
||||
check_data_folder()
|
||||
|
||||
log_list_started(filter_patterns, filter_type)
|
||||
timer = TimedProgress(360, prefix=' ')
|
||||
try:
|
||||
links = list(list_archive_data(
|
||||
filter_patterns=filter_patterns,
|
||||
filter_type=filter_type,
|
||||
after=after,
|
||||
before=before,
|
||||
))
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
if not len(links):
|
||||
log_removal_finished(0, 0)
|
||||
raise SystemExit(1)
|
||||
|
||||
|
||||
log_list_finished(links)
|
||||
log_removal_started(links, yes=yes, delete=delete)
|
||||
|
||||
timer = TimedProgress(360, prefix=' ')
|
||||
try:
|
||||
to_keep = []
|
||||
all_links = load_main_index(out_dir=OUTPUT_DIR)
|
||||
for link in all_links:
|
||||
should_remove = (
|
||||
(after is not None and float(link.timestamp) < after)
|
||||
or (before is not None and float(link.timestamp) > before)
|
||||
or link_matches_filter(link, filter_patterns, filter_type)
|
||||
)
|
||||
if not should_remove:
|
||||
to_keep.append(link)
|
||||
elif should_remove and delete:
|
||||
shutil.rmtree(link.link_dir)
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
write_main_index(links=to_keep, out_dir=OUTPUT_DIR, finished=True)
|
||||
log_removal_finished(len(all_links), len(to_keep))
|
||||
|
||||
return to_keep
|
||||
|
||||
|
||||
|
||||
def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||
"""indexed links without checking archive status or data directory validity"""
|
||||
return {
|
||||
link.link_dir: link
|
||||
for link in links
|
||||
}
|
||||
|
||||
def get_archived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||
"""indexed links that are archived with a valid data directory"""
|
||||
return {
|
||||
link.link_dir: link
|
||||
for link in filter(is_archived, links)
|
||||
}
|
||||
|
||||
def get_unarchived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||
"""indexed links that are unarchived with no data directory or an empty data directory"""
|
||||
return {
|
||||
link.link_dir: link
|
||||
for link in filter(is_unarchived, links)
|
||||
}
|
||||
|
||||
def get_present_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||
"""dirs that are expected to exist based on the main index"""
|
||||
all_folders = {}
|
||||
|
||||
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
|
||||
if entry.is_dir(follow_symlinks=True):
|
||||
link = None
|
||||
try:
|
||||
link = parse_json_link_details(entry.path)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
all_folders[entry.path] = link
|
||||
|
||||
return all_folders
|
||||
|
||||
def get_valid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||
"""dirs with a valid index matched to the main index and archived content"""
|
||||
return {
|
||||
link.link_dir: link
|
||||
for link in filter(is_valid, links)
|
||||
}
|
||||
|
||||
def get_invalid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||
"""dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
|
||||
duplicate = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
|
||||
orphaned = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
|
||||
corrupted = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
|
||||
unrecognized = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
|
||||
return {**duplicate, **orphaned, **corrupted, **unrecognized}
|
||||
|
||||
|
||||
def get_duplicate_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||
"""dirs that conflict with other directories that have the same link URL or timestamp"""
|
||||
links = list(links)
|
||||
by_url = {link.url: 0 for link in links}
|
||||
by_timestamp = {link.timestamp: 0 for link in links}
|
||||
|
||||
duplicate_folders = {}
|
||||
|
||||
indexed_folders = {link.link_dir for link in links}
|
||||
data_folders = (
|
||||
entry.path
|
||||
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME))
|
||||
if entry.is_dir(follow_symlinks=True) and entry.path not in indexed_folders
|
||||
)
|
||||
|
||||
for path in chain(sorted(indexed_folders), sorted(data_folders)):
|
||||
link = None
|
||||
try:
|
||||
link = parse_json_link_details(path)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if link:
|
||||
# link folder has same timestamp as different link folder
|
||||
by_timestamp[link.timestamp] = by_timestamp.get(link.timestamp, 0) + 1
|
||||
if by_timestamp[link.timestamp] > 1:
|
||||
duplicate_folders[path] = link
|
||||
|
||||
# link folder has same url as different link folder
|
||||
by_url[link.url] = by_url.get(link.url, 0) + 1
|
||||
if by_url[link.url] > 1:
|
||||
duplicate_folders[path] = link
|
||||
|
||||
return duplicate_folders
|
||||
|
||||
def get_orphaned_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||
"""dirs that contain a valid index but aren't listed in the main index"""
|
||||
links = list(links)
|
||||
indexed_folders = {link.link_dir: link for link in links}
|
||||
orphaned_folders = {}
|
||||
|
||||
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
|
||||
if entry.is_dir(follow_symlinks=True):
|
||||
index_exists = os.path.exists(os.path.join(entry.path, 'index.json'))
|
||||
link = None
|
||||
try:
|
||||
link = parse_json_link_details(entry.path)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if index_exists and entry.path not in indexed_folders:
|
||||
# folder is a valid link data dir with index details, but it's not in the main index
|
||||
orphaned_folders[entry.path] = link
|
||||
|
||||
return orphaned_folders
|
||||
|
||||
def get_corrupted_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||
"""dirs that don't contain a valid index and aren't listed in the main index"""
|
||||
return {
|
||||
link.link_dir: link
|
||||
for link in filter(is_corrupt, links)
|
||||
}
|
||||
|
||||
def get_unrecognized_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||
"""dirs that don't contain recognizable archive data and aren't listed in the main index"""
|
||||
by_timestamp = {link.timestamp: 0 for link in links}
|
||||
unrecognized_folders: Dict[str, Optional[Link]] = {}
|
||||
|
||||
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
|
||||
if entry.is_dir(follow_symlinks=True):
|
||||
index_exists = os.path.exists(os.path.join(entry.path, 'index.json'))
|
||||
link = None
|
||||
try:
|
||||
link = parse_json_link_details(entry.path)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if index_exists and link is None:
|
||||
# index exists but it's corrupted or unparseable
|
||||
unrecognized_folders[entry.path] = link
|
||||
|
||||
elif not index_exists:
|
||||
# link details index doesn't exist and the folder isn't in the main index
|
||||
timestamp = entry.path.rsplit('/', 1)[-1]
|
||||
if timestamp not in by_timestamp:
|
||||
unrecognized_folders[entry.path] = link
|
||||
|
||||
return unrecognized_folders
|
||||
|
||||
|
||||
def is_valid(link: Link) -> bool:
|
||||
dir_exists = os.path.exists(link.link_dir)
|
||||
index_exists = os.path.exists(os.path.join(link.link_dir, 'index.json'))
|
||||
if not dir_exists:
|
||||
# unarchived links are not included in the valid list
|
||||
return False
|
||||
if dir_exists and not index_exists:
|
||||
return False
|
||||
if dir_exists and index_exists:
|
||||
try:
|
||||
parsed_link = parse_json_link_details(link.link_dir)
|
||||
return link.url == parsed_link.url
|
||||
except Exception:
|
||||
pass
|
||||
return False
|
||||
|
||||
def is_corrupt(link: Link) -> bool:
|
||||
if not os.path.exists(link.link_dir):
|
||||
# unarchived links are not considered corrupt
|
||||
return False
|
||||
|
||||
if is_valid(link):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def is_archived(link: Link) -> bool:
|
||||
return is_valid(link) and link.is_archived
|
||||
|
||||
def is_unarchived(link: Link) -> bool:
|
||||
if not os.path.exists(link.link_dir):
|
||||
return True
|
||||
return not link.is_archived
|
|
@ -1,10 +0,0 @@
|
|||
[mypy_django_plugin]

# specify settings module to use for django.conf.settings, this setting
# could also be specified with DJANGO_SETTINGS_MODULE environment variable
# (it also takes priority over config file)
django_settings = core.settings

# if True, all unknown settings in django.conf.settings will fallback to Any,
# specify it if your settings are loaded dynamically to avoid false positives
ignore_missing_settings = True
|
|
@ -1,331 +0,0 @@
|
|||
"""
|
||||
Everything related to parsing links from input sources.
|
||||
|
||||
For a list of supported services, see the README.md.
|
||||
For examples of supported import formats see tests/.
|
||||
|
||||
Link: {
|
||||
'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
|
||||
'timestamp': '1544212312.4234',
|
||||
'title': 'Example.com Page Title',
|
||||
'tags': 'abc,def',
|
||||
'sources': [
|
||||
'output/sources/ril_export.html',
|
||||
'output/sources/getpocket.com-1523422111.txt',
|
||||
'output/sources/stdin-234234112312.txt'
|
||||
]
|
||||
}
|
||||
"""
|
||||
|
||||
import re
|
||||
import json
|
||||
|
||||
from typing import Tuple, List, IO, Iterable
|
||||
from datetime import datetime
|
||||
import xml.etree.ElementTree as etree
|
||||
|
||||
from .config import TIMEOUT
|
||||
from .util import (
|
||||
htmldecode,
|
||||
str_between,
|
||||
URL_REGEX,
|
||||
check_url_parsing_invariants,
|
||||
TimedProgress,
|
||||
Link,
|
||||
enforce_types,
|
||||
)
|
||||
|
||||
|
||||
@enforce_types
|
||||
def parse_links(source_file: str) -> Tuple[List[Link], str]:
|
||||
"""parse a list of URLs with their metadata from an
|
||||
RSS feed, bookmarks export, or text file
|
||||
"""
|
||||
|
||||
check_url_parsing_invariants()
|
||||
PARSERS = (
|
||||
# Specialized parsers
|
||||
('Pocket HTML', parse_pocket_html_export),
|
||||
('Pinboard RSS', parse_pinboard_rss_export),
|
||||
('Shaarli RSS', parse_shaarli_rss_export),
|
||||
('Medium RSS', parse_medium_rss_export),
|
||||
|
||||
# General parsers
|
||||
('Netscape HTML', parse_netscape_html_export),
|
||||
('Generic RSS', parse_rss_export),
|
||||
('Generic JSON', parse_json_export),
|
||||
|
||||
# Fallback parser
|
||||
('Plain Text', parse_plain_text_export),
|
||||
)
|
||||
timer = TimedProgress(TIMEOUT * 4)
|
||||
with open(source_file, 'r', encoding='utf-8') as file:
|
||||
for parser_name, parser_func in PARSERS:
|
||||
try:
|
||||
links = list(parser_func(file))
|
||||
if links:
|
||||
timer.end()
|
||||
return links, parser_name
|
||||
except Exception as err: # noqa
|
||||
# Parsers are tried one by one down the list, and the first one
|
||||
# that succeeds is used. To see why a certain parser was not used
|
||||
# due to error or format incompatibility, uncomment this line:
|
||||
# print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
|
||||
pass
|
||||
|
||||
timer.end()
|
||||
return [], 'Failed to parse'
|
||||
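# Example: the try-each-parser chain above in miniature, over plain strings.
# The first parser that returns a non-empty list wins; exceptions just mean
# "wrong format, try the next one". Names here are illustrative only.
import json
from typing import Callable, List, Sequence, Tuple

def first_successful_parser(text: str,
                            parsers: Sequence[Tuple[str, Callable[[str], List[str]]]]) -> Tuple[List[str], str]:
    for name, parse in parsers:
        try:
            links = list(parse(text))
            if links:
                return links, name
        except Exception:
            continue
    return [], 'Failed to parse'

demo_parsers = (
    ('Fake JSON',  lambda text: [entry['url'] for entry in json.loads(text)]),
    ('Plain Text', lambda text: [word for word in text.split() if word.startswith('http')]),
)
print(first_successful_parser('see https://example.com', demo_parsers))
# -> (['https://example.com'], 'Plain Text')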
|
||||
|
||||
### Import Parser Functions
|
||||
|
||||
@enforce_types
|
||||
def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]:
|
||||
"""Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
|
||||
|
||||
html_file.seek(0)
|
||||
pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)
|
||||
for line in html_file:
|
||||
# example line
|
||||
# <li><a href="http://example.com/ time_added="1478739709" tags="tag1,tag2">example title</a></li>
|
||||
match = pattern.search(line)
|
||||
if match:
|
||||
url = match.group(1).replace('http://www.readability.com/read?url=', '') # remove old readability prefixes to get original url
|
||||
time = datetime.fromtimestamp(float(match.group(2)))
|
||||
tags = match.group(3)
|
||||
title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '')
|
||||
|
||||
yield Link(
|
||||
url=htmldecode(url),
|
||||
timestamp=str(time.timestamp()),
|
||||
title=htmldecode(title) or None,
|
||||
tags=tags or '',
|
||||
sources=[html_file.name],
|
||||
)
|
||||
|
||||
|
||||
@enforce_types
|
||||
def parse_json_export(json_file: IO[str]) -> Iterable[Link]:
|
||||
"""Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
|
||||
|
||||
json_file.seek(0)
|
||||
links = json.load(json_file)
|
||||
json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
|
||||
|
||||
for link in links:
|
||||
# example line
|
||||
# {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
|
||||
if link:
|
||||
# Parse URL
|
||||
url = link.get('href') or link.get('url') or link.get('URL')
|
||||
if not url:
|
||||
raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
|
||||
|
||||
# Parse the timestamp
|
||||
ts_str = str(datetime.now().timestamp())
|
||||
if link.get('timestamp'):
|
||||
# chrome/ff histories use a very precise timestamp
|
||||
ts_str = str(link['timestamp'] / 10000000)
|
||||
elif link.get('time'):
|
||||
ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
|
||||
elif link.get('created_at'):
|
||||
ts_str = str(json_date(link['created_at']).timestamp())
|
||||
elif link.get('created'):
|
||||
ts_str = str(json_date(link['created']).timestamp())
|
||||
elif link.get('date'):
|
||||
ts_str = str(json_date(link['date']).timestamp())
|
||||
elif link.get('bookmarked'):
|
||||
ts_str = str(json_date(link['bookmarked']).timestamp())
|
||||
elif link.get('saved'):
|
||||
ts_str = str(json_date(link['saved']).timestamp())
|
||||
|
||||
# Parse the title
|
||||
title = None
|
||||
if link.get('title'):
|
||||
title = link['title'].strip()
|
||||
elif link.get('description'):
|
||||
title = link['description'].replace(' — Readability', '').strip()
|
||||
elif link.get('name'):
|
||||
title = link['name'].strip()
|
||||
|
||||
yield Link(
|
||||
url=htmldecode(url),
|
||||
timestamp=ts_str,
|
||||
title=htmldecode(title) or None,
|
||||
tags=htmldecode(link.get('tags')) or '',
|
||||
sources=[json_file.name],
|
||||
)
|
||||
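# Example: a single Pinboard-style entry (fabricated) run through the same
# field fallbacks as above: href/url/URL for the link, then one of several
# possible date keys, then title/description/name for the title.
from datetime import datetime

entry = {
    "href": "http://www.reddit.com/r/example",
    "description": "title here",
    "time": "2014-06-14T15:51:42+0000",
    "tags": "reddit android",
}
url = entry.get('href') or entry.get('url') or entry.get('URL')
ts_str = str(datetime.strptime(entry['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%S%z').timestamp())
title = (entry.get('title') or entry.get('description') or '').strip()
print(url, ts_str, title)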
|
||||
|
||||
@enforce_types
|
||||
def parse_rss_export(rss_file: IO[str]) -> Iterable[Link]:
|
||||
"""Parse RSS XML-format files into links"""
|
||||
|
||||
rss_file.seek(0)
|
||||
items = rss_file.read().split('<item>')
|
||||
items = items[1:] if items else []
|
||||
for item in items:
|
||||
# example item:
|
||||
# <item>
|
||||
# <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
|
||||
# <category>Unread</category>
|
||||
# <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
|
||||
# <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
|
||||
# <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
|
||||
# </item>
|
||||
|
||||
trailing_removed = item.split('</item>', 1)[0]
|
||||
leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
|
||||
rows = leading_removed.split('\n')
|
||||
|
||||
def get_row(key):
|
||||
return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
|
||||
|
||||
url = str_between(get_row('link'), '<link>', '</link>')
|
||||
ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
|
||||
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
|
||||
title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
|
||||
|
||||
yield Link(
|
||||
url=htmldecode(url),
|
||||
timestamp=str(time.timestamp()),
|
||||
title=htmldecode(title) or None,
|
||||
tags=None,
|
||||
sources=[rss_file.name],
|
||||
)
|
||||
|
||||
|
||||
@enforce_types
|
||||
def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]:
|
||||
"""Parse Shaarli-specific RSS XML-format files into links"""
|
||||
|
||||
rss_file.seek(0)
|
||||
entries = rss_file.read().split('<entry>')[1:]
|
||||
for entry in entries:
|
||||
# example entry:
|
||||
# <entry>
|
||||
# <title>Aktuelle Trojaner-Welle: Emotet lauert in gefälschten Rechnungsmails | heise online</title>
|
||||
# <link href="https://www.heise.de/security/meldung/Aktuelle-Trojaner-Welle-Emotet-lauert-in-gefaelschten-Rechnungsmails-4291268.html" />
|
||||
# <id>https://demo.shaarli.org/?cEV4vw</id>
|
||||
# <published>2019-01-30T06:06:01+00:00</published>
|
||||
# <updated>2019-01-30T06:06:01+00:00</updated>
|
||||
# <content type="html" xml:lang="en"><![CDATA[<div class="markdown"><p>— <a href="https://demo.shaarli.org/?cEV4vw">Permalink</a></p></div>]]></content>
|
||||
# </entry>
|
||||
|
||||
trailing_removed = entry.split('</entry>', 1)[0]
|
||||
leading_removed = trailing_removed.strip()
|
||||
rows = leading_removed.split('\n')
|
||||
|
||||
def get_row(key):
|
||||
return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0]
|
||||
|
||||
title = str_between(get_row('title'), '<title>', '</title>').strip()
|
||||
url = str_between(get_row('link'), '<link href="', '" />')
|
||||
ts_str = str_between(get_row('published'), '<published>', '</published>')
|
||||
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
|
||||
|
||||
yield Link(
|
||||
url=htmldecode(url),
|
||||
timestamp=str(time.timestamp()),
|
||||
title=htmldecode(title) or None,
|
||||
tags=None,
|
||||
sources=[rss_file.name],
|
||||
)
|
||||
|
||||
|
||||
@enforce_types
|
||||
def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]:
|
||||
"""Parse netscape-format bookmarks export files (produced by all browsers)"""
|
||||
|
||||
html_file.seek(0)
|
||||
pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
|
||||
for line in html_file:
|
||||
# example line
|
||||
# <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://example.com/favicon.ico" ICON="data:image/png;base64,...">example bookmark title</A>
|
||||
|
||||
match = pattern.search(line)
|
||||
if match:
|
||||
url = match.group(1)
|
||||
time = datetime.fromtimestamp(float(match.group(2)))
|
||||
title = match.group(3).strip()
|
||||
|
||||
yield Link(
|
||||
url=htmldecode(url),
|
||||
timestamp=str(time.timestamp()),
|
||||
title=htmldecode(title) or None,
|
||||
tags=None,
|
||||
sources=[html_file.name],
|
||||
)
|
||||
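# Example: the pattern above applied to the sample line from the comment. The
# three groups are the URL, the ADD_DATE unix timestamp, and the bookmark title.
import re

pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
line = '<DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974">example bookmark title</A>'
match = pattern.search(line)
assert match and match.groups() == ('https://example.com/?q=1+2', '1497562974', 'example bookmark title')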
|
||||
|
||||
@enforce_types
|
||||
def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]:
|
||||
"""Parse Pinboard RSS feed files into links"""
|
||||
|
||||
rss_file.seek(0)
|
||||
root = etree.parse(rss_file).getroot()
|
||||
items = root.findall("{http://purl.org/rss/1.0/}item")
|
||||
for item in items:
|
||||
find = lambda p: item.find(p).text.strip() if item.find(p) else None # type: ignore
|
||||
|
||||
url = find("{http://purl.org/rss/1.0/}link")
|
||||
tags = find("{http://purl.org/dc/elements/1.1/}subject")
|
||||
title = find("{http://purl.org/rss/1.0/}title")
|
||||
ts_str = find("{http://purl.org/dc/elements/1.1/}date")
|
||||
|
||||
# Pinboard includes a colon in its date stamp timezone offsets, which
|
||||
# Python can't parse. Remove it:
|
||||
if ts_str and ts_str[-3:-2] == ":":
|
||||
ts_str = ts_str[:-3]+ts_str[-2:]
|
||||
|
||||
if ts_str:
|
||||
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
|
||||
else:
|
||||
time = datetime.now()
|
||||
|
||||
yield Link(
|
||||
url=htmldecode(url),
|
||||
timestamp=str(time.timestamp()),
|
||||
title=htmldecode(title) or None,
|
||||
tags=htmldecode(tags) or None,
|
||||
sources=[rss_file.name],
|
||||
)
|
||||
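# Example: why the colon is stripped above. Before Python 3.7, strptime's %z
# does not accept "+00:00"-style offsets, only "+0000".
from datetime import datetime

ts_str = "2019-01-30T06:06:01+00:00"
if ts_str[-3:-2] == ":":
    ts_str = ts_str[:-3] + ts_str[-2:]
print(datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z"))
# -> 2019-01-30 06:06:01+00:00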
|
||||
|
||||
@enforce_types
|
||||
def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]:
|
||||
"""Parse Medium RSS feed files into links"""
|
||||
|
||||
rss_file.seek(0)
|
||||
root = etree.parse(rss_file).getroot()
|
||||
items = root.find("channel").findall("item") # type: ignore
|
||||
for item in items:
|
||||
url = item.find("link").text # type: ignore
|
||||
title = item.find("title").text.strip() # type: ignore
|
||||
ts_str = item.find("pubDate").text # type: ignore
|
||||
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z") # type: ignore
|
||||
|
||||
yield Link(
|
||||
url=htmldecode(url),
|
||||
timestamp=str(time.timestamp()),
|
||||
title=htmldecode(title) or None,
|
||||
tags=None,
|
||||
sources=[rss_file.name],
|
||||
)
|
||||
|
||||
|
||||
@enforce_types
|
||||
def parse_plain_text_export(text_file: IO[str]) -> Iterable[Link]:
|
||||
"""Parse raw links from each line in a text file"""
|
||||
|
||||
text_file.seek(0)
|
||||
for line in text_file.readlines():
|
||||
urls = re.findall(URL_REGEX, line) if line.strip() else ()
|
||||
for url in urls: # type: ignore
|
||||
yield Link(
|
||||
url=htmldecode(url),
|
||||
timestamp=str(datetime.now().timestamp()),
|
||||
title=None,
|
||||
tags=None,
|
||||
sources=[text_file.name],
|
||||
)
|
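# Example: URL_REGEX is defined in util.py and not shown here; any URL-matching
# pattern slots into the same per-line findall loop. Rough standalone sketch:
import re

SIMPLE_URL_REGEX = re.compile(r'https?://[^\s<>"\']+')

def urls_from_text(text: str):
    for line in text.splitlines():
        if line.strip():
            yield from SIMPLE_URL_REGEX.findall(line)

print(list(urls_from_text('notes:\nsee https://example.com and http://archive.org/web\n')))
# -> ['https://example.com', 'http://archive.org/web']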
@ -1,89 +0,0 @@
#!/usr/bin/env python3

import re
from argparse import ArgumentParser
from os.path import exists, join
from shutil import rmtree
from typing import List

from .config import ARCHIVE_DIR, OUTPUT_DIR
from .index import (
    parse_json_links_index,
    write_html_links_index,
    write_json_links_index,
)


def cleanup_index(regexes: List[str], proceed: bool, delete: bool) -> None:
    if not exists(join(OUTPUT_DIR, 'index.json')):
        exit('index.json is missing; nothing to do')

    compiled = [re.compile(r) for r in regexes]
    links = parse_json_links_index(OUTPUT_DIR)
    filtered = []
    remaining = []

    for link in links:
        url = link.url
        for r in compiled:
            if r.search(url):
                filtered.append((link, r))
                break
        else:
            remaining.append(link)

    if not filtered:
        exit('Search did not match any entries.')

    print('Filtered out {}/{} urls:'.format(len(filtered), len(links)))

    for link, regex in filtered:
        url = link.url
        print(' {url} via {regex}'.format(url=url, regex=regex.pattern))

    if not proceed:
        answer = input('Remove {} entries from index? [y/n] '.format(
            len(filtered)))
        proceed = answer.strip().lower() in ('y', 'yes')

    if not proceed:
        exit('Aborted')

    write_json_links_index(OUTPUT_DIR, remaining)
    write_html_links_index(OUTPUT_DIR, remaining)

    if delete:
        for link, _ in filtered:
            data_dir = join(ARCHIVE_DIR, link['timestamp'])
            if exists(data_dir):
                rmtree(data_dir)


if __name__ == '__main__':
    p = ArgumentParser('Index purging tool')
    p.add_argument(
        '--regex',
        '-r',
        action='append',
        help='Regular expression matching URLs to purge',
    )
    p.add_argument(
        '--delete',
        '-d',
        action='store_true',
        default=False,
        help='Delete webpage files from archive',
    )
    p.add_argument(
        '--yes',
        '-y',
        action='store_true',
        default=False,
        help='Do not prompt for confirmation',
    )

    args = p.parse_args()
    if args.regex:
        cleanup_index(args.regex, proceed=args.yes, delete=args.delete)
    else:
        p.print_help()

@ -1 +0,0 @@
__package__ = 'archivebox.legacy.storage'
archivebox/main.py (new file, 1086 lines)
archivebox/parsers/__init__.py (new file, 68 lines)
@ -0,0 +1,68 @@
"""
Everything related to parsing links from input sources.

For a list of supported services, see the README.md.
For examples of supported import formats see tests/.
"""

__package__ = 'archivebox.parsers'


from typing import Tuple, List

from ..config import TIMEOUT
from ..util import (
    check_url_parsing_invariants,
    TimedProgress,
    Link,
    enforce_types,
)
from .pocket_html import parse_pocket_html_export
from .pinboard_rss import parse_pinboard_rss_export
from .shaarli_rss import parse_shaarli_rss_export
from .medium_rss import parse_medium_rss_export
from .netscape_html import parse_netscape_html_export
from .generic_rss import parse_generic_rss_export
from .generic_json import parse_generic_json_export
from .generic_txt import parse_generic_txt_export


@enforce_types
def parse_links(source_file: str) -> Tuple[List[Link], str]:
    """parse a list of URLs with their metadata from an
       RSS feed, bookmarks export, or text file
    """

    check_url_parsing_invariants()
    PARSERS = (
        # Specialized parsers
        ('Pocket HTML', parse_pocket_html_export),
        ('Pinboard RSS', parse_pinboard_rss_export),
        ('Shaarli RSS', parse_shaarli_rss_export),
        ('Medium RSS', parse_medium_rss_export),

        # General parsers
        ('Netscape HTML', parse_netscape_html_export),
        ('Generic RSS', parse_generic_rss_export),
        ('Generic JSON', parse_generic_json_export),

        # Fallback parser
        ('Plain Text', parse_generic_txt_export),
    )
    timer = TimedProgress(TIMEOUT * 4)
    with open(source_file, 'r', encoding='utf-8') as file:
        for parser_name, parser_func in PARSERS:
            try:
                links = list(parser_func(file))
                if links:
                    timer.end()
                    return links, parser_name
            except Exception as err:   # noqa
                # Parsers are tried one by one down the list, and the first one
                # that succeeds is used. To see why a certain parser was not used
                # due to error or format incompatibility, uncomment this line:
                # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
                pass

    timer.end()
    return [], 'Failed to parse'
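For orientation, a minimal usage sketch of this entry point. It assumes the archivebox package from this branch is importable; the export path is hypothetical:

# sketch: running a bookmarks export through the parser chain above (path is hypothetical)
from archivebox.parsers import parse_links

links, parser_name = parse_links('output/sources/bookmarks-export.html')
print('parsed {} links using the {} parser'.format(len(links), parser_name))
for link in links[:3]:
    print(link.timestamp, link.url)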
archivebox/parsers/generic_json.py (new file, 65 lines)
@ -0,0 +1,65 @@
__package__ = 'archivebox.parsers'

import json

from typing import IO, Iterable
from datetime import datetime

from ..index.schema import Link
from ..util import (
    htmldecode,
    enforce_types,
)


@enforce_types
def parse_generic_json_export(json_file: IO[str]) -> Iterable[Link]:
    """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""

    json_file.seek(0)
    links = json.load(json_file)
    json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')

    for link in links:
        # example line
        # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
        if link:
            # Parse URL
            url = link.get('href') or link.get('url') or link.get('URL')
            if not url:
                raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')

            # Parse the timestamp
            ts_str = str(datetime.now().timestamp())
            if link.get('timestamp'):
                # chrome/ff histories use a very precise timestamp
                ts_str = str(link['timestamp'] / 10000000)
            elif link.get('time'):
                ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
            elif link.get('created_at'):
                ts_str = str(json_date(link['created_at']).timestamp())
            elif link.get('created'):
                ts_str = str(json_date(link['created']).timestamp())
            elif link.get('date'):
                ts_str = str(json_date(link['date']).timestamp())
            elif link.get('bookmarked'):
                ts_str = str(json_date(link['bookmarked']).timestamp())
            elif link.get('saved'):
                ts_str = str(json_date(link['saved']).timestamp())

            # Parse the title
            title = None
            if link.get('title'):
                title = link['title'].strip()
            elif link.get('description'):
                title = link['description'].replace(' — Readability', '').strip()
            elif link.get('name'):
                title = link['name'].strip()

            yield Link(
                url=htmldecode(url),
                timestamp=ts_str,
                title=htmldecode(title) or None,
                tags=htmldecode(link.get('tags')) or '',
                sources=[json_file.name],
            )
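As a quick sanity check of the accepted shape, a hedged sketch that writes one Pinboard-style entry (mirroring the example comment above) to a temporary file and feeds it to the parser. It assumes this package is importable, and note that parsing the trailing 'Z' offset with %z requires Python 3.7+:

# sketch: parsing a one-entry Pinboard-style JSON export (temp file stands in for a real export)
import json
import tempfile

from archivebox.parsers.generic_json import parse_generic_json_export

entry = {
    'href': 'http://www.reddit.com/r/example',
    'description': 'title here',
    'time': '2014-06-14T15:51:42Z',     # '%z' only accepts the trailing 'Z' on Python 3.7+
    'tags': 'reddit android',
}

with tempfile.NamedTemporaryFile('w+', suffix='.json') as f:
    json.dump([entry], f)
    f.flush()
    links = list(parse_generic_json_export(f))

print(links[0].url, '|', links[0].title, '|', links[0].tags)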
archivebox/parsers/generic_rss.py (new file, 49 lines)
@ -0,0 +1,49 @@
__package__ = 'archivebox.parsers'


from typing import IO, Iterable
from datetime import datetime

from ..index.schema import Link
from ..util import (
    htmldecode,
    enforce_types,
    str_between,
)

@enforce_types
def parse_generic_rss_export(rss_file: IO[str]) -> Iterable[Link]:
    """Parse RSS XML-format files into links"""

    rss_file.seek(0)
    items = rss_file.read().split('<item>')
    items = items[1:] if items else []
    for item in items:
        # example item:
        # <item>
        # <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
        # <category>Unread</category>
        # <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
        # <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
        # <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
        # </item>

        trailing_removed = item.split('</item>', 1)[0]
        leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
        rows = leading_removed.split('\n')

        def get_row(key):
            return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]

        url = str_between(get_row('link'), '<link>', '</link>')
        ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
        title = str_between(get_row('title'), '<![CDATA[', ']]').strip()

        yield Link(
            url=htmldecode(url),
            timestamp=str(time.timestamp()),
            title=htmldecode(title) or None,
            tags=None,
            sources=[rss_file.name],
        )
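The pubDate handling relies on strptime with a %z offset; a stdlib-only check of that format string against the date from the example item above:

# sketch: the RFC 822-style pubDate format used above, applied to the example item's date
from datetime import datetime

ts_str = 'Mon, 21 Aug 2017 14:21:58 -0500'
time = datetime.strptime(ts_str, '%a, %d %b %Y %H:%M:%S %z')

print(time.isoformat())       # 2017-08-21T14:21:58-05:00
print(str(time.timestamp()))  # the value that ends up in Link.timestamp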
archivebox/parsers/generic_txt.py (new file, 30 lines)
@ -0,0 +1,30 @@
__package__ = 'archivebox.parsers'
__description__ = 'Plain Text'

import re

from typing import IO, Iterable
from datetime import datetime

from ..index.schema import Link
from ..util import (
    htmldecode,
    enforce_types,
    URL_REGEX
)

@enforce_types
def parse_generic_txt_export(text_file: IO[str]) -> Iterable[Link]:
    """Parse raw links from each line in a text file"""

    text_file.seek(0)
    for line in text_file.readlines():
        urls = re.findall(URL_REGEX, line) if line.strip() else ()
        for url in urls:    # type: ignore
            yield Link(
                url=htmldecode(url),
                timestamp=str(datetime.now().timestamp()),
                title=None,
                tags=None,
                sources=[text_file.name],
            )
archivebox/parsers/medium_rss.py (new file, 35 lines)
@ -0,0 +1,35 @@
__package__ = 'archivebox.parsers'


from typing import IO, Iterable
from datetime import datetime

from xml.etree import ElementTree

from ..index.schema import Link
from ..util import (
    htmldecode,
    enforce_types,
)


@enforce_types
def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]:
    """Parse Medium RSS feed files into links"""

    rss_file.seek(0)
    root = ElementTree.parse(rss_file).getroot()
    items = root.find("channel").findall("item")    # type: ignore
    for item in items:
        url = item.find("link").text                # type: ignore
        title = item.find("title").text.strip()     # type: ignore
        ts_str = item.find("pubDate").text          # type: ignore
        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z")    # type: ignore

        yield Link(
            url=htmldecode(url),
            timestamp=str(time.timestamp()),
            title=htmldecode(title) or None,
            tags=None,
            sources=[rss_file.name],
        )
archivebox/parsers/netscape_html.py (new file, 39 lines)
@ -0,0 +1,39 @@
__package__ = 'archivebox.parsers'


import re

from typing import IO, Iterable
from datetime import datetime

from ..index.schema import Link
from ..util import (
    htmldecode,
    enforce_types,
)


@enforce_types
def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]:
    """Parse netscape-format bookmarks export files (produced by all browsers)"""

    html_file.seek(0)
    pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
    for line in html_file:
        # example line
        # <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://example.com/favicon.ico" ICON="data:image/png;base64,...">example bookmark title</A>

        match = pattern.search(line)
        if match:
            url = match.group(1)
            time = datetime.fromtimestamp(float(match.group(2)))
            title = match.group(3).strip()

            yield Link(
                url=htmldecode(url),
                timestamp=str(time.timestamp()),
                title=htmldecode(title) or None,
                tags=None,
                sources=[html_file.name],
            )
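To make the group mapping explicit, a stdlib-only sketch running the same regex against the example line from the comment above:

# sketch: the bookmark regex from parse_netscape_html_export applied to the example line
import re
from datetime import datetime

pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
line = '<DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" ICON="...">example bookmark title</A>'

match = pattern.search(line)
if match:
    print(match.group(1))                                 # https://example.com/?q=1+2
    print(datetime.fromtimestamp(float(match.group(2))))  # 2017-06-15 ... (local timezone)
    print(match.group(3).strip())                         # example bookmark title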
archivebox/parsers/pinboard_rss.py (new file, 47 lines)
@ -0,0 +1,47 @@
__package__ = 'archivebox.parsers'


from typing import IO, Iterable
from datetime import datetime

from xml.etree import ElementTree

from ..index.schema import Link
from ..util import (
    htmldecode,
    enforce_types,
)


@enforce_types
def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]:
    """Parse Pinboard RSS feed files into links"""

    rss_file.seek(0)
    root = ElementTree.parse(rss_file).getroot()
    items = root.findall("{http://purl.org/rss/1.0/}item")
    for item in items:
        find = lambda p: item.find(p).text.strip() if item.find(p) else None    # type: ignore

        url = find("{http://purl.org/rss/1.0/}link")
        tags = find("{http://purl.org/dc/elements/1.1/}subject")
        title = find("{http://purl.org/rss/1.0/}title")
        ts_str = find("{http://purl.org/dc/elements/1.1/}date")

        # Pinboard includes a colon in its date stamp timezone offsets, which
        # Python can't parse. Remove it:
        if ts_str and ts_str[-3:-2] == ":":
            ts_str = ts_str[:-3]+ts_str[-2:]

        if ts_str:
            time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
        else:
            time = datetime.now()

        yield Link(
            url=htmldecode(url),
            timestamp=str(time.timestamp()),
            title=htmldecode(title) or None,
            tags=htmldecode(tags) or None,
            sources=[rss_file.name],
        )
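The colon-stripping exists because Pinboard emits offsets like +00:00, which strptime's %z directive only accepts on Python 3.7+; a small stdlib-only sketch of the normalization (the timestamp value is hypothetical):

# sketch: normalizing a '+00:00' style offset so older Pythons can parse it with %z
from datetime import datetime

ts_str = '2019-01-30T06:06:01+00:00'       # hypothetical Pinboard-style dc:date value
if ts_str and ts_str[-3:-2] == ':':
    ts_str = ts_str[:-3] + ts_str[-2:]     # -> '2019-01-30T06:06:01+0000'

time = datetime.strptime(ts_str, '%Y-%m-%dT%H:%M:%S%z')
print(time.isoformat())                    # 2019-01-30T06:06:01+00:00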
archivebox/parsers/pocket_html.py (new file, 38 lines)
@ -0,0 +1,38 @@
__package__ = 'archivebox.parsers'


import re

from typing import IO, Iterable
from datetime import datetime

from ..index.schema import Link
from ..util import (
    htmldecode,
    enforce_types,
)


@enforce_types
def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]:
    """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""

    html_file.seek(0)
    pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)
    for line in html_file:
        # example line
        # <li><a href="http://example.com/ time_added="1478739709" tags="tag1,tag2">example title</a></li>
        match = pattern.search(line)
        if match:
            url = match.group(1).replace('http://www.readability.com/read?url=', '')    # remove old readability prefixes to get original url
            time = datetime.fromtimestamp(float(match.group(2)))
            tags = match.group(3)
            title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '')

            yield Link(
                url=htmldecode(url),
                timestamp=str(time.timestamp()),
                title=htmldecode(title) or None,
                tags=tags or '',
                sources=[html_file.name],
            )
archivebox/parsers/shaarli_rss.py (new file, 50 lines)
@ -0,0 +1,50 @@
__package__ = 'archivebox.parsers'


from typing import IO, Iterable
from datetime import datetime

from ..index.schema import Link
from ..util import (
    htmldecode,
    enforce_types,
    str_between,
)


@enforce_types
def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]:
    """Parse Shaarli-specific RSS XML-format files into links"""

    rss_file.seek(0)
    entries = rss_file.read().split('<entry>')[1:]
    for entry in entries:
        # example entry:
        # <entry>
        # <title>Aktuelle Trojaner-Welle: Emotet lauert in gefälschten Rechnungsmails | heise online</title>
        # <link href="https://www.heise.de/security/meldung/Aktuelle-Trojaner-Welle-Emotet-lauert-in-gefaelschten-Rechnungsmails-4291268.html" />
        # <id>https://demo.shaarli.org/?cEV4vw</id>
        # <published>2019-01-30T06:06:01+00:00</published>
        # <updated>2019-01-30T06:06:01+00:00</updated>
        # <content type="html" xml:lang="en"><![CDATA[<div class="markdown"><p>— <a href="https://demo.shaarli.org/?cEV4vw">Permalink</a></p></div>]]></content>
        # </entry>

        trailing_removed = entry.split('</entry>', 1)[0]
        leading_removed = trailing_removed.strip()
        rows = leading_removed.split('\n')

        def get_row(key):
            return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0]

        title = str_between(get_row('title'), '<title>', '</title>').strip()
        url = str_between(get_row('link'), '<link href="', '" />')
        ts_str = str_between(get_row('published'), '<published>', '</published>')
        time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")

        yield Link(
            url=htmldecode(url),
            timestamp=str(time.timestamp()),
            title=htmldecode(title) or None,
            tags=None,
            sources=[rss_file.name],
        )
(7 binary image files moved with sizes unchanged: 15 KiB, 17 KiB, 1.6 KiB, 158 B, 201 B, 157 B, 11 KiB)
@ -1,6 +1,7 @@
import os
import re
import sys
import ssl
import json
import time
import shutil

@ -8,7 +9,7 @@ import argparse

from string import Template
from json import JSONEncoder
from typing import List, Optional, Any, Union, IO, Mapping, Tuple
from typing import List, Dict, Optional, Any, Union, IO, Mapping, Tuple
from inspect import signature
from functools import wraps
from hashlib import sha256

@ -28,11 +29,12 @@ from subprocess import (

from base32_crockford import encode as base32_encode    # type: ignore

from .schema import Link
from .index.schema import Link
from .config import (
    ANSI,
    TERM_WIDTH,
    SOURCES_DIR,
    OUTPUT_DIR,
    SOURCES_DIR_NAME,
    OUTPUT_PERMISSIONS,
    TIMEOUT,
    SHOW_PROGRESS,

@ -40,8 +42,9 @@ from .config import (
    CHECK_SSL_VALIDITY,
    WGET_USER_AGENT,
    CHROME_OPTIONS,
    check_data_folder,
)
from .logs import pretty_path
from .cli.logging import pretty_path

### Parsing Helpers


@ -187,31 +190,36 @@ def check_url_parsing_invariants() -> None:
### Random Helpers

@enforce_types
def handle_stdin_import(raw_text: str) -> str:
    if not os.path.exists(SOURCES_DIR):
        os.makedirs(SOURCES_DIR)
def save_stdin_to_sources(raw_text: str, out_dir: str=OUTPUT_DIR) -> str:
    check_data_folder(out_dir=out_dir)

    sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
    if not os.path.exists(sources_dir):
        os.makedirs(sources_dir)

    ts = str(datetime.now().timestamp()).split('.', 1)[0]

    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format('stdin', ts))
    source_path = os.path.join(sources_dir, '{}-{}.txt'.format('stdin', ts))

    atomic_write(raw_text, source_path)
    return source_path


@enforce_types
def handle_file_import(path: str, timeout: int=TIMEOUT) -> str:
def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DIR) -> str:
    """download a given url's content into output/sources/domain-<timestamp>.txt"""
    check_data_folder(out_dir=out_dir)

    if not os.path.exists(SOURCES_DIR):
        os.makedirs(SOURCES_DIR)
    sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
    if not os.path.exists(sources_dir):
        os.makedirs(sources_dir)

    ts = str(datetime.now().timestamp()).split('.', 1)[0]

    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(basename(path), ts))
    source_path = os.path.join(sources_dir, '{}-{}.txt'.format(basename(path), ts))

    if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
        source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(path), ts))
        source_path = os.path.join(sources_dir, '{}-{}.txt'.format(domain(path), ts))
        print('{}[*] [{}] Downloading {}{}'.format(
            ANSI['green'],
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),

@ -532,7 +540,6 @@ def download_url(url: str, timeout: int=TIMEOUT) -> str:
    if CHECK_SSL_VALIDITY:
        resp = urlopen(req, timeout=timeout)
    else:
        import ssl
        insecure = ssl._create_unverified_context()
        resp = urlopen(req, timeout=timeout, context=insecure)


@ -662,7 +669,7 @@ def to_json(obj: Any, file: IO=None, indent: Optional[int]=4, sort_keys: bool=Tr
    return json.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)


def to_csv(links: List[Link], csv_cols: Optional[List[str]]=None,
def links_to_csv(links: List[Link], csv_cols: Optional[List[str]]=None,
                 header: bool=True, ljust: int=0, separator: str=',') -> str:
    csv_cols = csv_cols or ['timestamp', 'is_archived', 'url']


@ -677,6 +684,8 @@ def to_csv(links: List[Link], csv_cols: Optional[List[str]]=None,

    return '\n'.join((header_str, *row_strs))

def folders_to_str(folders: Dict[str, Optional[Link]]) -> str:
    return '\n'.join(f'{folder} {link}' for folder, link in folders.items())

@enforce_types
def render_template(template_path: str, context: Mapping[str, str]) -> str:

@ -713,11 +722,11 @@ def atomic_write(contents: Union[dict, str, bytes], path: str) -> None:
        os.remove(tmp_file)


def reject_stdin(caller: str) -> None:
def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
    """Tell the user they passed stdin to a command that doesn't accept it"""

    if not sys.stdin.isatty():
        stdin_raw_text = sys.stdin.read().strip()
    if stdin and not stdin.isatty():
        stdin_raw_text = stdin.read().strip()
        if stdin_raw_text:
            print(
                '{red}[X] The "{}" command does not accept stdin.{reset}\n'.format(

@ -731,9 +740,30 @@ def reject_stdin(caller: str) -> None:
            print()
            raise SystemExit(1)


def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]:
    if stdin and not stdin.isatty():
        return stdin.read()
    return None


def set_docstring(text: str):
    def decorator(func):
        @wraps(func)
        def wrapper_with_docstring(*args, **kwargs):
            return func(*args, **kwargs)
        wrapper_with_docstring.__doc__ = text
        return wrapper_with_docstring
    return decorator


class SmartFormatter(argparse.HelpFormatter):
    def _split_lines(self, text, width):
        if '\n' in text:
            return text.splitlines()
        return argparse.HelpFormatter._split_lines(self, text, width)


class ArchiveError(Exception):
    def __init__(self, message, hints=None):
        super().__init__(message)
        self.hints = hints
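To show how these helpers are meant to fit together, a hedged sketch of a hypothetical subcommand entry point in the new style; the command name and behavior are illustrative, not part of this diff, and only helpers defined above are used:

# sketch: a hypothetical subcommand wiring args/stdin/pwd through the helpers above
import argparse
from typing import IO, List, Optional

from archivebox.util import accept_stdin, SmartFormatter    # reject_stdin() is the counterpart for commands that take no stdin


def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    parser = argparse.ArgumentParser(
        prog='archivebox example',            # hypothetical command name
        formatter_class=SmartFormatter,       # preserves explicit newlines in help text
    )
    parser.add_argument('urls', nargs='*', help='URLs to add\n(or pipe them in via stdin)')
    command = parser.parse_args(args or ())

    stdin_raw_text = accept_stdin(stdin)      # None when stdin is a TTY or not provided
    urls = command.urls or (stdin_raw_text.split() if stdin_raw_text else [])
    print('would archive {} urls into {}'.format(len(urls), pwd or '.'))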