Merge pull request #1168 from mAAdhaTTah/add-readwise-reader

This commit is contained in:
Nick Sweeting 2023-09-03 21:24:49 -07:00 committed by GitHub
commit 16d278fbdb
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 127 additions and 0 deletions

View file

@ -223,6 +223,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
'READWISE_READER_TOKENS': {'type': dict, 'default': {}},
},
}

View file

@ -34,6 +34,7 @@ from ..index.schema import Link
from ..logging_util import TimedProgress, log_source_saved
from . import pocket_api
from . import readwise_reader_api
from . import wallabag_atom
from . import pocket_html
from . import pinboard_rss
@ -51,6 +52,7 @@ from . import url_list
PARSERS = {
# Specialized parsers
pocket_api.KEY: (pocket_api.NAME, pocket_api.PARSER),
readwise_reader_api.KEY: (readwise_reader_api.NAME, readwise_reader_api.PARSER),
wallabag_atom.KEY: (wallabag_atom.NAME, wallabag_atom.PARSER),
pocket_html.KEY: (pocket_html.NAME, pocket_html.PARSER),
pinboard_rss.KEY: (pinboard_rss.NAME, pinboard_rss.PARSER),

View file

@ -0,0 +1,123 @@
__package__ = "archivebox.parsers"
import re
import requests
from datetime import datetime
from typing import IO, Iterable, Optional
from configparser import ConfigParser
from pathlib import Path
from ..index.schema import Link
from ..util import enforce_types
from ..system import atomic_write
from ..config import (
SOURCES_DIR,
READWISE_READER_TOKENS,
)
API_DB_PATH = Path(SOURCES_DIR) / "readwise_reader_api.db"
class ReadwiseReaderAPI:
cursor: Optional[str]
def __init__(self, api_token, cursor=None) -> None:
self.api_token = api_token
self.cursor = cursor
def get_archive(self):
response = requests.get(
url="https://readwise.io/api/v3/list/",
headers={"Authorization": "Token s71gNtiNDWquEvlJFFUyDU10ao8fn99lGyNryvyllQcDSnrd7X"},
params={
"location": "archive",
"pageCursor": self.cursor,
}
)
response.raise_for_status()
return response
def get_readwise_reader_articles(api: ReadwiseReaderAPI):
response = api.get_archive()
body = response.json()
articles = body["results"]
yield from articles
if body['nextPageCursor']:
api.cursor = body["nextPageCursor"]
yield from get_readwise_reader_articles(api)
def link_from_article(article: dict, sources: list):
url: str = article['source_url']
title = article["title"] or url
timestamp = datetime.fromisoformat(article['updated_at']).timestamp()
return Link(
url=url,
timestamp=str(timestamp),
title=title,
tags="",
sources=sources,
)
def write_cursor(username: str, since: str):
if not API_DB_PATH.exists():
atomic_write(API_DB_PATH, "")
since_file = ConfigParser()
since_file.optionxform = str
since_file.read(API_DB_PATH)
since_file[username] = {"since": since}
with open(API_DB_PATH, "w+") as new:
since_file.write(new)
def read_cursor(username: str) -> Optional[str]:
if not API_DB_PATH.exists():
atomic_write(API_DB_PATH, "")
config_file = ConfigParser()
config_file.optionxform = str
config_file.read(API_DB_PATH)
return config_file.get(username, "since", fallback=None)
@enforce_types
def should_parse_as_readwise_reader_api(text: str) -> bool:
return text.startswith("readwise-reader://")
@enforce_types
def parse_readwise_reader_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse bookmarks from the Readwise Reader API"""
input_buffer.seek(0)
pattern = re.compile(r"^readwise-reader:\/\/(\w+)")
for line in input_buffer:
if should_parse_as_readwise_reader_api(line):
username = pattern.search(line).group(1)
api = ReadwiseReaderAPI(READWISE_READER_TOKENS[username], cursor=read_cursor(username))
for article in get_readwise_reader_articles(api):
yield link_from_article(article, sources=[line])
if api.cursor:
write_cursor(username, api.cursor)
KEY = "readwise_reader_api"
NAME = "Readwise Reader API"
PARSER = parse_readwise_reader_api_export