2019-04-27 21:26:24 +00:00
|
|
|
__package__ = 'archivebox.index'
|
2019-04-18 02:00:54 +00:00
|
|
|
|
2019-04-17 06:25:28 +00:00
|
|
|
import os
|
2019-04-19 01:09:54 +00:00
|
|
|
import sys
|
2019-05-01 03:13:04 +00:00
|
|
|
import json as pyjson
|
2020-07-22 19:22:00 +00:00
|
|
|
from pathlib import Path
|
2019-04-17 06:25:28 +00:00
|
|
|
|
|
|
|
from datetime import datetime
|
2020-09-03 22:26:49 +00:00
|
|
|
from typing import List, Optional, Iterator, Any, Union
|
2019-04-17 06:25:28 +00:00
|
|
|
|
2020-11-28 18:11:15 +00:00
|
|
|
from .schema import Link
|
2019-05-01 03:13:04 +00:00
|
|
|
from ..system import atomic_write
|
|
|
|
from ..util import enforce_types
|
2019-04-17 06:25:28 +00:00
|
|
|
from ..config import (
|
|
|
|
VERSION,
|
|
|
|
OUTPUT_DIR,
|
2019-04-19 01:09:54 +00:00
|
|
|
FOOTER_INFO,
|
|
|
|
DEPENDENCIES,
|
|
|
|
JSON_INDEX_FILENAME,
|
2019-04-24 08:07:46 +00:00
|
|
|
ARCHIVE_DIR_NAME,
|
2020-07-22 22:08:32 +00:00
|
|
|
ANSI
|
2019-04-17 06:25:28 +00:00
|
|
|
)
|
2019-04-27 21:26:24 +00:00
|
|
|
|
2019-04-17 06:25:28 +00:00
|
|
|
|
2019-04-19 01:09:54 +00:00
|
|
|
# Static header dict merged into the main index JSON output when
# generate_json_index_from_links() is called with with_headers=True.
# All values are resolved once at import time from ..config.
MAIN_INDEX_HEADER = {
    'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
    'schema': 'archivebox.index.json',
    'copyright_info': FOOTER_INFO,
    'meta': {
        'project': 'ArchiveBox',
        'version': VERSION,
        'git_sha': VERSION,  # not used anymore, but kept for backwards compatibility
        'website': 'https://ArchiveBox.io',
        'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
        'source': 'https://github.com/ArchiveBox/ArchiveBox',
        'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
        'dependencies': DEPENDENCIES,
    },
}
|
|
|
|
|
2020-11-28 17:28:39 +00:00
|
|
|
@enforce_types
def generate_json_index_from_links(links: List[Link], with_headers: bool):
    """Render the given links as a JSON index string.

    When with_headers is True, the links are wrapped in MAIN_INDEX_HEADER
    along with run metadata (count, timestamp, argv); otherwise the bare
    list of links is serialized.
    """
    if not with_headers:
        return to_json(links, indent=4, sort_keys=True)

    payload = dict(MAIN_INDEX_HEADER)
    payload.update({
        'num_links': len(links),
        'updated': datetime.now(),
        'last_run_cmd': sys.argv,
        'links': links,
    })
    return to_json(payload, indent=4, sort_keys=True)
|
|
|
|
|
2019-04-17 06:25:28 +00:00
|
|
|
|
|
|
|
@enforce_types
def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
    """parse an archive index json file and return the list of links

    Each entry is loaded directly from the main index; entries with missing
    fields are recovered from their per-snapshot detail index under
    archive/<timestamp>/, then as a last resort by guessing the missing
    values. Unrecoverable entries are skipped with a warning.
    """

    index_path = Path(out_dir) / JSON_INDEX_FILENAME
    if index_path.exists():
        with open(index_path, 'r', encoding='utf-8') as f:
            links = pyjson.load(f)['links']

        for link_json in links:
            try:
                yield Link.from_json(link_json)
                continue
            except KeyError:
                pass

            # Fall back to the full per-snapshot index. Compute the path with
            # .get() so a missing 'timestamp' key can't raise here: previously
            # a KeyError on link_json['timestamp'] left detail_index_path
            # unbound and the warning print below crashed with a NameError.
            detail_index_path = Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / link_json.get('timestamp', '')
            try:
                link = parse_json_link_details(str(detail_index_path))
                # parse_json_link_details returns Optional[Link]; don't leak
                # None into an Iterator[Link], fall through to guessing instead
                if link is not None:
                    yield link
                    continue
            except KeyError:
                pass

            # as a last effort, try to guess the missing values out of existing ones
            try:
                yield Link.from_json(link_json, guess=True)
            except KeyError:
                print("    {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI))
                continue
    return ()
|
|
|
|
|
|
|
|
### Link Details Index
|
|
|
|
|
|
|
|
@enforce_types
def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
    """write a json file with some info about the link"""

    # default to the link's own snapshot directory when no out_dir is given
    target_dir = out_dir if out_dir else link.link_dir
    index_file = Path(target_dir) / JSON_INDEX_FILENAME
    details = link._asdict(extended=True)
    atomic_write(str(index_file), details)
|
2019-04-17 06:25:28 +00:00
|
|
|
|
|
|
|
|
|
|
|
@enforce_types
def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Link]:
    """load the json link index from a given directory"""
    index_file = Path(out_dir) / JSON_INDEX_FILENAME
    if not index_file.exists():
        return None

    with open(index_file, 'r', encoding='utf-8') as f:
        try:
            return Link.from_json(pyjson.load(f), guess)
        except pyjson.JSONDecodeError:
            # corrupt or partially-written index file: treat it as missing
            return None
|
2019-04-24 08:07:46 +00:00
|
|
|
|
2019-05-01 03:13:04 +00:00
|
|
|
|
2019-04-24 08:07:46 +00:00
|
|
|
@enforce_types
def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]:
    """read through all the archive data folders and return the parsed links"""

    archive_dir = Path(out_dir) / ARCHIVE_DIR_NAME
    for entry in os.scandir(archive_dir):
        if not entry.is_dir(follow_symlinks=True):
            continue
        if not (Path(entry.path) / 'index.json').exists():
            continue
        try:
            parsed = parse_json_link_details(entry.path)
        except KeyError:
            parsed = None
        if parsed:
            yield parsed
|
2019-05-01 03:13:04 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
### Helpers
|
|
|
|
|
|
|
|
class ExtendedEncoder(pyjson.JSONEncoder):
    """
    Extended json serializer that supports serializing several model
    fields and objects
    """

    def default(self, obj):
        # NamedTuple-style objects (anything exposing _asdict, e.g. Link)
        if hasattr(obj, '_asdict'):
            return obj._asdict()

        if isinstance(obj, bytes):
            return obj.decode()

        if isinstance(obj, datetime):
            return obj.isoformat()

        if isinstance(obj, Exception):
            return '{}: {}'.format(obj.__class__.__name__, obj)

        # dict view objects aren't JSON-serializable natively
        if obj.__class__.__name__ in ('dict_items', 'dict_keys', 'dict_values'):
            return tuple(obj)

        return pyjson.JSONEncoder.default(self, obj)
|
|
|
|
|
|
|
|
|
|
|
|
@enforce_types
def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> str:
    """Serialize obj to a JSON string, using ExtendedEncoder by default."""
    # pass the caller's cls through instead of hardcoding ExtendedEncoder:
    # previously the cls parameter was accepted but silently ignored
    return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=cls)
|
|
|
|
|