"""
Everything related to parsing links from bookmark services.
For a list of supported services, see the README.md.
For examples of supported files see examples/.
Parsed link schema: {
'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
'domain': 'example.com',
'base_url': 'example.com/example/',
'timestamp': '15442123124234',
'tags': 'abc,def',
'title': 'Example.com Page Title',
'sources': ['ril_export.html', 'downloads/getpocket.com.txt'],
}
"""
import re
import json
import xml.etree.ElementTree as etree
from datetime import datetime
from util import (
domain,
base_url,
str_between,
get_link_type,
)
def get_parsers(file):
    """Return a mapping of parser name -> parser function.

    The ``file`` argument is accepted but not inspected here, so every
    known parser is always returned. The mapping keeps the order in which
    the parsers are listed below.
    """
    available = dict(
        pocket=parse_pocket_export,
        pinboard=parse_json_export,
        bookmarks=parse_bookmarks_export,
        rss=parse_rss_export,
        pinboard_rss=parse_pinboard_rss_feed,
        medium_rss=parse_medium_rss_feed,
    )
    return available
def parse_links(path):
    """Return the list of link dicts parsed from the bookmark export at ``path``.

    The file is opened as UTF-8 text and handed to each parser from
    ``get_parsers`` in turn. The first parser that produces at least one
    link wins and no further parsers are tried. An empty list is returned
    when no parser yields anything.
    """
    found = []
    with open(path, 'r', encoding='utf-8') as export_file:
        for parse in get_parsers(export_file).values():
            try:
                found += list(parse(export_file))
            except (ValueError, TypeError, IndexError, AttributeError, etree.ParseError):
                # this parser does not understand the file; try the next one
                continue
            if found:
                # a parser succeeded, no need to try the remaining ones
                break
    return found
def parse_pocket_export(html_file):
"""Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
html_file.seek(0)
pattern = re.compile("^\\s*