ArchiveBox/archivebox/parsers/generic_json.py

__package__ = 'archivebox.parsers'

import json

from typing import IO, Iterable
from datetime import datetime, timezone

from ..index.schema import Link
from ..util import (
    htmldecode,
    enforce_types,
)


@enforce_types
def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""

    json_file.seek(0)

    # sometimes the first line is a comment or filepath, so we get everything after the first {
    json_file_json_str = '{' + json_file.read().split('{', 1)[-1]
    links = json.loads(json_file_json_str)
    json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')

    for link in links:
        # example line
        # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
        if link:
            # Parse URL
            url = link.get('href') or link.get('url') or link.get('URL')
            if not url:
                raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')

            # Parse the timestamp
            ts_str = str(datetime.now(timezone.utc).timestamp())
            if link.get('timestamp'):
                # chrome/ff histories use a very precise timestamp
                ts_str = str(link['timestamp'] / 10000000)  
            elif link.get('time'):
                ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
            elif link.get('created_at'):
                ts_str = str(json_date(link['created_at']).timestamp())
            elif link.get('created'):
                ts_str = str(json_date(link['created']).timestamp())
            elif link.get('date'):
                ts_str = str(json_date(link['date']).timestamp())
            elif link.get('bookmarked'):
                ts_str = str(json_date(link['bookmarked']).timestamp())
            elif link.get('saved'):
                ts_str = str(json_date(link['saved']).timestamp())
            
            # Parse the title
            title = None
            if link.get('title'):
                title = link['title'].strip()
            elif link.get('description'):
                title = link['description'].replace(' — Readability', '').strip()
            elif link.get('name'):
                title = link['name'].strip()

            yield Link(
                url=htmldecode(url),
                timestamp=ts_str,
                title=htmldecode(title) or None,
                tags=htmldecode(link.get('tags')) or '',
                sources=[json_file.name],
            )


KEY = 'json'
NAME = 'Generic JSON'
PARSER = parse_generic_json_export
move everything out of legacy folder 2019-04-27 21:26:24 +00:00			`__package__ = 'archivebox.parsers'`

			`import json`

			`from typing import IO, Iterable`
add timezone support, tons of CSS and layout improvements, more detailed snapshot admin form info, ability to sort by recently updated, better grid view styling, better table layouts, better dark mode support 2021-04-10 08:19:30 +00:00			`from datetime import datetime, timezone`
move everything out of legacy folder 2019-04-27 21:26:24 +00:00
			`from ..index.schema import Link`
			`from ..util import (`
			`htmldecode,`
			`enforce_types,`
			`)`


			`@enforce_types`
make all parsers accept arbitrary meta kwargs 2020-08-18 12:27:47 +00:00			`def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:`
move everything out of legacy folder 2019-04-27 21:26:24 +00:00			`"""Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""`

			`json_file.seek(0)`
only start parsing json after the first open brace 2023-09-04 04:40:12 +00:00
			`# sometimes the first line is a comment or filepath, so we get everything after the first {`
			`json_file_json_str = '{' + json_file.read().split('{', 1)[-1]`
			`links = json.loads(json_file_json_str)`
move everything out of legacy folder 2019-04-27 21:26:24 +00:00			`json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')`

			`for link in links:`
			`# example line`
			`# {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]`
			`if link:`
			`# Parse URL`
			`url = link.get('href') or link.get('url') or link.get('URL')`
			`if not url:`
			`raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')`

			`# Parse the timestamp`
add timezone support, tons of CSS and layout improvements, more detailed snapshot admin form info, ability to sort by recently updated, better grid view styling, better table layouts, better dark mode support 2021-04-10 08:19:30 +00:00			`ts_str = str(datetime.now(timezone.utc).timestamp())`
move everything out of legacy folder 2019-04-27 21:26:24 +00:00			`if link.get('timestamp'):`
			`# chrome/ff histories use a very precise timestamp`
			`ts_str = str(link['timestamp'] / 10000000)`
			`elif link.get('time'):`
			`ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())`
			`elif link.get('created_at'):`
			`ts_str = str(json_date(link['created_at']).timestamp())`
			`elif link.get('created'):`
			`ts_str = str(json_date(link['created']).timestamp())`
			`elif link.get('date'):`
			`ts_str = str(json_date(link['date']).timestamp())`
			`elif link.get('bookmarked'):`
			`ts_str = str(json_date(link['bookmarked']).timestamp())`
			`elif link.get('saved'):`
			`ts_str = str(json_date(link['saved']).timestamp())`

			`# Parse the title`
			`title = None`
			`if link.get('title'):`
			`title = link['title'].strip()`
			`elif link.get('description'):`
			`title = link['description'].replace(' — Readability', '').strip()`
			`elif link.get('name'):`
			`title = link['name'].strip()`

			`yield Link(`
			`url=htmldecode(url),`
			`timestamp=ts_str,`
			`title=htmldecode(title) or None,`
			`tags=htmldecode(link.get('tags')) or '',`
			`sources=[json_file.name],`
			`)`
use KEY, NAME, and PARSER to define parsers instead of hardcoding in init 2021-03-31 05:05:49 +00:00

			`KEY = 'json'`
			`NAME = 'Generic JSON'`
			`PARSER = parse_generic_json_export`