# ArchiveBox/archivebox/index/json.py

__package__ = 'archivebox.index'

import os
import sys
import json as pyjson
from pathlib import Path

from datetime import datetime
from typing import List, Optional, Iterator, Any

from .schema import Link, ArchiveResult
from ..system import atomic_write
from ..util import enforce_types
from ..config import (
    VERSION,
    OUTPUT_DIR,
    FOOTER_INFO,
    GIT_SHA,
    DEPENDENCIES,
    JSON_INDEX_FILENAME,
    ARCHIVE_DIR_NAME,
    ANSI
)


# Static header fields merged into the top level of the main JSON index
# by write_json_main_index(). Values that vary per install (version, git sha,
# dependencies, footer text) are taken from the config module.
MAIN_INDEX_HEADER = {
    'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
    'schema': 'archivebox.index.json',
    'copyright_info': FOOTER_INFO,
    # Project metadata describing the ArchiveBox build that wrote the index.
    'meta': {
        'project': 'ArchiveBox',
        'version': VERSION,
        'git_sha': GIT_SHA,
        'website': 'https://ArchiveBox.io',
        'docs': 'https://github.com/pirate/ArchiveBox/wiki',
        'source': 'https://github.com/pirate/ArchiveBox',
        'issues': 'https://github.com/pirate/ArchiveBox/issues',
        'dependencies': DEPENDENCIES,
    },
}

### Main Links Index

@enforce_types
def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
    """Parse the main JSON index found in out_dir and yield the links it lists.

    Each entry is loaded with progressively more lenient strategies:

    1. ``Link.from_json`` on the entry stored in the main index;
    2. the link's own detail index, looked up under
       ``<out_dir>/<ARCHIVE_DIR_NAME>/<timestamp>``;
    3. ``Link.from_json(..., guess=True)`` on the main index entry.

    An entry that fails all three is reported on stdout and skipped.
    Nothing is yielded when the main index file does not exist.
    """

    index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
    if not os.path.exists(index_path):
        return

    # Close the file before yielding so the handle is not kept open for as
    # long as the consumer takes to iterate over the generator.
    with open(index_path, 'r', encoding='utf-8') as f:
        links = pyjson.load(f)['links']

    for link_json in links:
        # Reset per entry so the failure message never reports an unbound
        # name or a path left over from a previous entry.
        detail_index_path = None
        try:
            link = Link.from_json(link_json)
        except KeyError:
            try:
                # Resolve relative to the index being parsed rather than the
                # global default, so a non-default out_dir is honoured.
                detail_index_path = Path(out_dir) / ARCHIVE_DIR_NAME / link_json['timestamp']
                link = parse_json_link_details(str(detail_index_path))
            except KeyError:
                link = None

            if link is None:
                # The detail index was missing, unreadable or incomplete:
                # as a last effort, try to guess the missing values out of existing ones
                try:
                    link = Link.from_json(link_json, guess=True)
                except KeyError:
                    print("    {lightyellow}! Failed to load the index.json from {}".format(
                        detail_index_path or index_path,
                        **ANSI,
                    ))
                    continue

        yield link

@enforce_types
def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
    """Serialize the given links, plus the index header, into the main JSON index in out_dir."""

    assert isinstance(links, List), 'Links must be a list, not a generator.'

    # Spot-check the shape of the first link only; an empty list needs no checks.
    if links:
        first_link = links[0]
        assert isinstance(first_link.history, dict)
        assert isinstance(first_link.sources, list)

        title_results = first_link.history.get('title')
        if title_results:
            assert isinstance(title_results[0], ArchiveResult)

        if first_link.sources:
            assert isinstance(first_link.sources[0], str)

    # Start from the static header, then append the per-run fields.
    payload = dict(MAIN_INDEX_HEADER)
    payload.update({
        'num_links': len(links),
        'updated': datetime.now(),
        'last_run_cmd': sys.argv,
        'links': links,
    })

    index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
    atomic_write(index_path, payload)


### Link Details Index

@enforce_types
def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
    """Write the link's extended dict form to the JSON index file in its folder.

    When out_dir is not given (or is empty), link.link_dir is used instead.
    """

    target_dir = out_dir if out_dir else link.link_dir
    index_file = os.path.join(target_dir, JSON_INDEX_FILENAME)
    link_data = link._asdict(extended=True)
    atomic_write(index_file, link_data)


@enforce_types
def parse_json_link_details(out_dir: str, guess: Optional[bool]=False) -> Optional[Link]:
    """Read the JSON index inside out_dir and build a Link from it.

    Returns None when the index file does not exist or does not contain
    valid JSON. The guess flag is forwarded to Link.from_json.
    """
    index_file = os.path.join(out_dir, JSON_INDEX_FILENAME)
    if not os.path.exists(index_file):
        return None

    with open(index_file, 'r', encoding='utf-8') as handle:
        try:
            return Link.from_json(pyjson.load(handle), guess)
        except pyjson.JSONDecodeError:
            # Malformed index files are deliberately ignored.
            return None


@enforce_types
def parse_json_links_details(out_dir: str) -> Iterator[Link]:
    """Scan the archive data folders under out_dir and yield the parsed links.

    Every directory (symlinks followed) inside ``<out_dir>/<ARCHIVE_DIR_NAME>``
    that contains a JSON index file is parsed with parse_json_link_details().
    Folders whose index is missing, unparseable, or lacks required keys are
    skipped silently.
    """

    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
        if not entry.is_dir(follow_symlinks=True):
            continue

        # Check for the configured index filename (not a hard-coded
        # 'index.json') so this agrees with what parse_json_link_details() reads.
        if not os.path.exists(os.path.join(entry.path, JSON_INDEX_FILENAME)):
            continue

        try:
            link = parse_json_link_details(entry.path)
        except KeyError:
            # The index exists but is missing fields needed to build a Link.
            link = None

        if link:
            yield link


### Helpers

class ExtendedEncoder(pyjson.JSONEncoder):
    """
    JSON encoder that can also serialize objects exposing `_asdict()`,
    bytes, datetimes, exceptions and dict views
    """

    def default(self, obj):
        # Anything exposing _asdict() serializes as that dict; checked first
        # so it takes priority over the type-based rules below.
        if hasattr(obj, '_asdict'):
            return obj._asdict()

        if isinstance(obj, bytes):
            return obj.decode()

        if isinstance(obj, datetime):
            return obj.isoformat()

        type_name = obj.__class__.__name__

        if isinstance(obj, Exception):
            template = '{}: {}'
            return template.format(type_name, obj)

        # dict views have no importable public type, so match them by class name.
        if type_name in {'dict_items', 'dict_keys', 'dict_values'}:
            return tuple(obj)

        # Unknown type: defer to the base encoder, which raises TypeError.
        return super().default(obj)


@enforce_types
def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> str:
    """Serialize obj to a JSON string.

    indent and sort_keys are forwarded to json.dumps. cls is the encoder
    class to use and defaults to ExtendedEncoder.
    """
    # Forward the caller's encoder class; it used to be ignored in favour of
    # a hard-coded ExtendedEncoder, making the cls parameter a no-op.
    return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=cls)
move everything out of legacy folder 2019-04-27 17:26:24 -04:00			`__package__ = 'archivebox.index'`
add package headers 2019-04-17 22:00:54 -04:00
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`import os`
add pipenv, schedule cmd, logs dir, and lots more 2019-04-18 21:09:54 -04:00			`import sys`
split up utils into separate files 2019-04-30 23:13:04 -04:00			`import json as pyjson`
feat: Fallback to link detail when there is an issue loading a link from main index 2020-07-22 14:22:00 -05:00			`from pathlib import Path`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00
			`from datetime import datetime`
split up utils into separate files 2019-04-30 23:13:04 -04:00			`from typing import List, Optional, Iterator, Any`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00
move everything out of legacy folder 2019-04-27 17:26:24 -04:00			`from .schema import Link, ArchiveResult`
split up utils into separate files 2019-04-30 23:13:04 -04:00			`from ..system import atomic_write`
			`from ..util import enforce_types`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`from ..config import (`
			`VERSION,`
			`OUTPUT_DIR,`
add pipenv, schedule cmd, logs dir, and lots more 2019-04-18 21:09:54 -04:00			`FOOTER_INFO,`
			`GIT_SHA,`
			`DEPENDENCIES,`
			`JSON_INDEX_FILENAME,`
add functions to parse link details jsons and list+apply migrations 2019-04-24 04:07:46 -04:00			`ARCHIVE_DIR_NAME,`
fix: Add notice for issues with index detail 2020-07-22 17:08:32 -05:00			`ANSI`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`)`
move everything out of legacy folder 2019-04-27 17:26:24 -04:00
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00
add pipenv, schedule cmd, logs dir, and lots more 2019-04-18 21:09:54 -04:00			`MAIN_INDEX_HEADER = {`
			`'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',`
move everything out of legacy folder 2019-04-27 17:26:24 -04:00			`'schema': 'archivebox.index.json',`
add pipenv, schedule cmd, logs dir, and lots more 2019-04-18 21:09:54 -04:00			`'copyright_info': FOOTER_INFO,`
			`'meta': {`
			`'project': 'ArchiveBox',`
			`'version': VERSION,`
			`'git_sha': GIT_SHA,`
			`'website': 'https://ArchiveBox.io',`
			`'docs': 'https://github.com/pirate/ArchiveBox/wiki',`
			`'source': 'https://github.com/pirate/ArchiveBox',`
			`'issues': 'https://github.com/pirate/ArchiveBox/issues',`
			`'dependencies': DEPENDENCIES,`
			`},`
			`}`

better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`### Main Links Index`

			`@enforce_types`
			`def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:`
move everything out of legacy folder 2019-04-27 17:26:24 -04:00			`"""parse an archive index json file and return the list of links"""`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00
add pipenv, schedule cmd, logs dir, and lots more 2019-04-18 21:09:54 -04:00			`index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`if os.path.exists(index_path):`
			`with open(index_path, 'r', encoding='utf-8') as f:`
split up utils into separate files 2019-04-30 23:13:04 -04:00			`links = pyjson.load(f)['links']`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`for link_json in links:`
feat: Fallback to link detail when there is an issue loading a link from main index 2020-07-22 14:22:00 -05:00			`try:`
			`yield Link.from_json(link_json)`
			`except KeyError:`
fix: Add notice for issues with index detail 2020-07-22 17:08:32 -05:00			`try:`
refactor: Change path calculation to use pathlib in a better way 2020-07-23 10:22:36 -05:00			`detail_index_path = Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / link_json['timestamp']`
fix: Add notice for issues with index detail 2020-07-22 17:08:32 -05:00			`yield parse_json_link_details(str(detail_index_path))`
			`except KeyError:`
fix: Guess timestamps and add placeholders to support older indices 2020-07-24 09:24:52 -05:00			`# as a last effort, try to guess the missing values out of existing ones`
			`try:`
			`yield Link.from_json(link_json, guess=True)`
			`except KeyError:`
			`print(" {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI))`
			`continue`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`return ()`

			`@enforce_types`
			`def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:`
			`"""write the json link index to a given path"""`

			`assert isinstance(links, List), 'Links must be a list, not a generator.'`
			`assert not links or isinstance(links[0].history, dict)`
			`assert not links or isinstance(links[0].sources, list)`

			`if links and links[0].history.get('title'):`
			`assert isinstance(links[0].history['title'][0], ArchiveResult)`

			`if links and links[0].sources:`
			`assert isinstance(links[0].sources[0], str)`

add pipenv, schedule cmd, logs dir, and lots more 2019-04-18 21:09:54 -04:00			`main_index_json = {`
			`**MAIN_INDEX_HEADER,`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`'num_links': len(links),`
			`'updated': datetime.now(),`
add archivebox info command to scan data dir 2019-04-22 14:34:30 -04:00			`'last_run_cmd': sys.argv,`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`'links': links,`
			`}`
Merge branch 'master' into django 2020-06-25 21:30:29 -04:00			`atomic_write(os.path.join(out_dir, JSON_INDEX_FILENAME), main_index_json)`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00

			`### Link Details Index`

			`@enforce_types`
			`def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:`
			`"""write a json file with some info about the link"""`

			`out_dir = out_dir or link.link_dir`
add pipenv, schedule cmd, logs dir, and lots more 2019-04-18 21:09:54 -04:00			`path = os.path.join(out_dir, JSON_INDEX_FILENAME)`
Merge branch 'master' into django 2020-06-25 21:30:29 -04:00			`atomic_write(path, link._asdict(extended=True))`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00

			`@enforce_types`
fix: Guess timestamps and add placeholders to support older indices 2020-07-24 09:24:52 -05:00			`def parse_json_link_details(out_dir: str, guess: Optional[bool]=False) -> Optional[Link]:`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`"""load the json link index from a given directory"""`
add pipenv, schedule cmd, logs dir, and lots more 2019-04-18 21:09:54 -04:00			`existing_index = os.path.join(out_dir, JSON_INDEX_FILENAME)`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`if os.path.exists(existing_index):`
			`with open(existing_index, 'r', encoding='utf-8') as f:`
ignore json parsing errors when loading link jsons 2019-04-24 11:38:13 -04:00			`try:`
split up utils into separate files 2019-04-30 23:13:04 -04:00			`link_json = pyjson.load(f)`
fix: Guess timestamps and add placeholders to support older indices 2020-07-24 09:24:52 -05:00			`return Link.from_json(link_json, guess)`
split up utils into separate files 2019-04-30 23:13:04 -04:00			`except pyjson.JSONDecodeError:`
ignore json parsing errors when loading link jsons 2019-04-24 11:38:13 -04:00			`pass`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`return None`
add functions to parse link details jsons and list+apply migrations 2019-04-24 04:07:46 -04:00
split up utils into separate files 2019-04-30 23:13:04 -04:00
add functions to parse link details jsons and list+apply migrations 2019-04-24 04:07:46 -04:00			`@enforce_types`
			`def parse_json_links_details(out_dir: str) -> Iterator[Link]:`
			`"""read through all the archive data folders and return the parsed links"""`

			`for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):`
			`if entry.is_dir(follow_symlinks=True):`
			`if os.path.exists(os.path.join(entry.path, 'index.json')):`
fix: Add notice for issues with index detail 2020-07-22 17:08:32 -05:00			`try:`
			`link = parse_json_link_details(entry.path)`
			`except KeyError:`
			`link = None`
move everything out of legacy folder 2019-04-27 17:26:24 -04:00			`if link:`
			`yield link`
split up utils into separate files 2019-04-30 23:13:04 -04:00


			`### Helpers`

			`class ExtendedEncoder(pyjson.JSONEncoder):`
			`"""`
			`Extended json serializer that supports serializing several model`
			`fields and objects`
			`"""`

			`def default(self, obj):`
			`cls_name = obj.__class__.__name__`

			`if hasattr(obj, '_asdict'):`
			`return obj._asdict()`

			`elif isinstance(obj, bytes):`
			`return obj.decode()`

			`elif isinstance(obj, datetime):`
			`return obj.isoformat()`

			`elif isinstance(obj, Exception):`
			`return '{}: {}'.format(obj.__class__.__name__, obj)`

			`elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):`
			`return tuple(obj)`

			`return pyjson.JSONEncoder.default(self, obj)`


			`@enforce_types`
			`def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> str:`
			`return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)`