move utils and vendored libs into subfolders

This commit is contained in:
Nick Sweeting 2020-12-06 02:01:18 +02:00
parent 8440858751
commit a0a79cead8
9 changed files with 413 additions and 52 deletions

View file

@ -14,7 +14,6 @@ from django import forms
from core.models import Snapshot, Tag
from core.forms import AddLinkForm, TagField
from core.utils import get_icons
from core.mixins import SearchResultsAdminMixin
from index.html import snapshot_icons

View file

@ -3,7 +3,7 @@ __package__ = 'archivebox.core'
from django import forms
from ..util import URL_REGEX
from .utils_taggit import edit_string_for_tags, parse_tags
from ..vendor.taggit_utils import edit_string_for_tags, parse_tags
CHOICES = (
('0', 'depth = 0 (archive just these URLs)'),

View file

@ -4,34 +4,35 @@ __package__ = 'archivebox.parsers'
import re
from typing import IO, Iterable, Optional
from datetime import datetime
from configparser import ConfigParser
from pathlib import Path
from pocket import Pocket
import requests
from ..vendor.pocket import Pocket
from ..index.schema import Link
from ..util import (
enforce_types,
)
from ..util import enforce_types
from ..system import atomic_write
from ..config import (
SOURCES_DIR
SOURCES_DIR,
POCKET_CONSUMER_KEY,
POCKET_ACCESS_TOKENS,
)
_COUNT_PER_PAGE = 500
_API_DB_PATH = Path(SOURCES_DIR) / 'pocket_api.db'
COUNT_PER_PAGE = 500
API_DB_PATH = Path(SOURCES_DIR) / 'pocket_api.db'
# search for broken protocols that sometimes come from the Pocket API
_BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))')
def get_pocket_articles(api: Pocket, since=None, page=0):
body, headers = api.get(
state='archive',
sort='oldest',
since=since,
count=_COUNT_PER_PAGE,
offset=page * _COUNT_PER_PAGE,
count=COUNT_PER_PAGE,
offset=page * COUNT_PER_PAGE,
)
articles = body['list'].values() if isinstance(body['list'], dict) else body['list']
@ -39,7 +40,7 @@ def get_pocket_articles(api: Pocket, since=None, page=0):
yield from articles
if returned_count == _COUNT_PER_PAGE:
if returned_count == COUNT_PER_PAGE:
yield from get_pocket_articles(api, since=since, page=page + 1)
else:
api.last_since = body['since']
@ -60,56 +61,53 @@ def link_from_article(article: dict, sources: list):
sources=sources
)
def write_since(username: str, since: str):
from ..system import atomic_write
if not _API_DB_PATH.exists():
atomic_write(_API_DB_PATH, '')
def write_since(username: str, since: str):
if not API_DB_PATH.exists():
atomic_write(API_DB_PATH, '')
since_file = ConfigParser()
since_file.optionxform = str
since_file.read(_API_DB_PATH)
since_file.read(API_DB_PATH)
since_file[username] = {
'since': since
}
with open(_API_DB_PATH, 'w+') as new:
with open(API_DB_PATH, 'w+') as new:
since_file.write(new)
def read_since(username: str) -> Optional[str]:
from ..system import atomic_write
if not _API_DB_PATH.exists():
atomic_write(_API_DB_PATH, '')
def read_since(username: str) -> Optional[str]:
if not API_DB_PATH.exists():
atomic_write(API_DB_PATH, '')
config_file = ConfigParser()
config_file.optionxform = str
config_file.read(_API_DB_PATH)
config_file.read(API_DB_PATH)
return config_file.get(username, 'since', fallback=None)
@enforce_types
def should_parse_as_pocket_api(text: str) -> bool:
    """Return True when `text` begins with the 'pocket://' marker."""
    pocket_scheme = 'pocket://'
    return text.startswith(pocket_scheme)
@enforce_types
def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse bookmarks from the Pocket API"""
input_buffer.seek(0)
pattern = re.compile("^pocket:\/\/(\w+)")
pattern = re.compile(r"^pocket:\/\/(\w+)")
for line in input_buffer:
if should_parse_as_pocket_api(line):
from ..config import (
POCKET_CONSUMER_KEY,
POCKET_ACCESS_TOKENS,
)
username = pattern.search(line).group(1)
api = Pocket(POCKET_CONSUMER_KEY, POCKET_ACCESS_TOKENS[username])
api.last_since = None
for article in get_pocket_articles(api, since=read_since(username)):
yield link_from_article(article, sources=[line])
write_since(username, api.last_since)
if should_parse_as_pocket_api(line):
username = pattern.search(line).group(1)
api = Pocket(POCKET_CONSUMER_KEY, POCKET_ACCESS_TOKENS[username])
api.last_since = None
for article in get_pocket_articles(api, since=read_since(username)):
yield link_from_article(article, sources=[line])
write_since(username, api.last_since)

View file

@ -1,11 +1,11 @@
__package__ = 'archivebox'
import re
from pathlib import Path
import requests
import json as pyjson
from typing import List, Optional, Any
from pathlib import Path
from inspect import signature
from functools import wraps
from hashlib import sha256
@ -13,10 +13,9 @@ from urllib.parse import urlparse, quote, unquote
from html import escape, unescape
from datetime import datetime
from dateparser import parse as dateparser
import requests
from requests.exceptions import RequestException, ReadTimeout
from .base32_crockford import encode as base32_encode # type: ignore
from .vendor.base32_crockford import encode as base32_encode # type: ignore
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
try:

0
archivebox/vendor/__init__.py vendored Normal file
View file

368
archivebox/vendor/pocket.py vendored Normal file
View file

@ -0,0 +1,368 @@
# https://github.com/tapanpandita/pocket/blob/master/pocket.py
import requests
import json
from functools import wraps
class PocketException(Exception):
    '''
    Root of the pocket exception hierarchy; every other exception in this
    module derives from it.
    http://getpocket.com/developer/docs/errors
    '''


class InvalidQueryException(PocketException):
    '''Exception type registered for HTTP status 400 in EXCEPTIONS.'''


class AuthException(PocketException):
    '''Exception type registered for HTTP status 401 in EXCEPTIONS.'''


class RateLimitException(PocketException):
    '''
    Exception type registered for HTTP status 403 in EXCEPTIONS.
    http://getpocket.com/developer/docs/rate-limits
    '''


class ServerMaintenanceException(PocketException):
    '''Exception type registered for HTTP status 503 in EXCEPTIONS.'''


# HTTP status code -> exception class to raise for that status.
EXCEPTIONS = {
    400: InvalidQueryException,
    401: AuthException,
    403: RateLimitException,
    503: ServerMaintenanceException,
}
def method_wrapper(fn):
    '''
    Decorator that turns a stub method into an API call.

    The decorated function's own body is never executed. Positional
    arguments are matched to the function's variable names (minus `self`),
    merged into the keyword arguments, stripped of None values, combined
    with `self.get_payload()` and sent through `self.make_request` to the
    endpoint registered under the function's name in `self.api_endpoints`.
    '''
    @wraps(fn)
    def wrapped(self, *args, **kwargs):
        # Recover parameter names so positional args can be sent by name.
        positional_names = list(fn.__code__.co_varnames)
        positional_names.remove('self')
        for name, value in zip(positional_names, args):
            kwargs[name] = value

        endpoint = self.api_endpoints[fn.__name__]

        # None means "not supplied": leave those keys out of the request.
        payload = {
            key: value for key, value in kwargs.items()
            if value is not None
        }
        payload.update(self.get_payload())

        return self.make_request(endpoint, payload)

    return wrapped
def bulk_wrapper(fn):
    '''
    Decorator that turns a stub method into a "send" action.

    The decorated function's own body is never executed. Its arguments
    (None values dropped) become one action dict whose 'action' key is the
    function name ('bulk_add' is sent as 'add'). When `wait` is truthy
    (the default) the action is queued with `self.add_bulk_query` and
    `self` is returned so calls can be chained; otherwise the single action
    is posted immediately as JSON and the request result is returned.
    '''
    @wraps(fn)
    def wrapped(self, *args, **kwargs):
        # Recover parameter names so positional args can be sent by name.
        positional_names = list(fn.__code__.co_varnames)
        positional_names.remove('self')
        for name, value in zip(positional_names, args):
            kwargs[name] = value

        should_queue = kwargs.get('wait', True)
        action = {
            key: value for key, value in kwargs.items()
            if value is not None
        }

        # TODO: Fix this hack
        if fn.__name__ == 'bulk_add':
            action['action'] = 'add'
        else:
            action['action'] = fn.__name__

        if should_queue:
            self.add_bulk_query(action)
            return self

        send_url = self.api_endpoints['send']
        request_body = {'actions': [action]}
        request_body.update(self.get_payload())
        return self.make_request(
            send_url,
            json.dumps(request_body),
            headers={'content-type': 'application/json'},
        )

    return wrapped
class Pocket:
    '''
    This class implements a basic python wrapper around the pocket api. For
    detailed documentation of the methods and what they do, please refer to
    the official pocket api documentation at
    http://getpocket.com/developer/docs/overview
    '''

    # Endpoint URL for each API method name; looked up by the decorators
    # via the decorated function's __name__.
    api_endpoints = {
        method: 'https://getpocket.com/v3/%s' % method
        for method in "add,send,get".split(",")
    }

    # Human-readable message for each HTTP status used in error reporting.
    statuses = {
        200: 'Request was successful',
        400: 'Invalid request, please make sure you follow the '
             'documentation for proper syntax',
        401: 'Problem authenticating the user',
        403: 'User was authenticated, but access denied due to lack of '
             'permission or rate limiting',
        503: 'Pocket\'s sync server is down for scheduled maintenance.',
    }

    def __init__(self, consumer_key, access_token):
        '''
        Store the credentials and start with an empty bulk-action queue.
        '''
        self.consumer_key = consumer_key
        self.access_token = access_token
        self._bulk_query = []

        # Credentials merged into every request payload.
        self._payload = {
            'consumer_key': self.consumer_key,
            'access_token': self.access_token,
        }

    def get_payload(self):
        '''Return the credential payload dict (the stored object itself).'''
        return self._payload

    def add_bulk_query(self, query):
        '''Queue one action dict to be sent by the next commit().'''
        self._bulk_query.append(query)

    @staticmethod
    def _post_request(url, payload, headers):
        '''POST `payload` to `url` and return the raw response object.'''
        r = requests.post(url, data=payload, headers=headers)
        return r

    @classmethod
    def _make_request(cls, url, payload, headers=None):
        '''
        POST the payload and return a `(body, headers)` tuple, where body is
        the decoded JSON (or the raw text when the JSON value is falsy).

        Raises the exception class registered in EXCEPTIONS for the status
        code (PocketException for unregistered codes) when status > 399.
        '''
        r = cls._post_request(url, payload, headers)

        if r.status_code > 399:
            error_msg = cls.statuses.get(r.status_code)
            extra_info = r.headers.get('X-Error')
            raise EXCEPTIONS.get(r.status_code, PocketException)(
                '%s. %s' % (error_msg, extra_info)
            )

        return r.json() or r.text, r.headers

    @classmethod
    def make_request(cls, url, payload, headers=None):
        '''Public entry point used by the decorators; see _make_request.'''
        return cls._make_request(url, payload, headers)

    @method_wrapper
    def add(self, url, title=None, tags=None, tweet_id=None):
        '''
        This method allows you to add a page to a user's list.
        In order to use the /v3/add endpoint, your consumer key must have the
        "Add" permission.
        http://getpocket.com/developer/docs/v3/add
        '''

    @method_wrapper
    def get(
        self, state=None, favorite=None, tag=None, contentType=None,
        sort=None, detailType=None, search=None, domain=None, since=None,
        count=None, offset=None
    ):
        '''
        This method allows you to retrieve a user's list. It supports
        retrieving items changed since a specific time to allow for syncing.
        http://getpocket.com/developer/docs/v3/retrieve
        '''

    @method_wrapper
    def send(self, actions):
        '''
        This method allows you to make changes to a user's list. It supports
        adding new pages, marking pages as read, changing titles, or updating
        tags. Multiple changes to items can be made in one request.
        http://getpocket.com/developer/docs/v3/modify
        '''

    @bulk_wrapper
    def bulk_add(
        self, item_id, ref_id=None, tags=None, time=None, title=None,
        url=None, wait=True
    ):
        '''
        Add a new item to the user's list
        http://getpocket.com/developer/docs/v3/modify#action_add
        '''

    @bulk_wrapper
    def archive(self, item_id, time=None, wait=True):
        '''
        Move an item to the user's archive
        http://getpocket.com/developer/docs/v3/modify#action_archive
        '''

    @bulk_wrapper
    def readd(self, item_id, time=None, wait=True):
        '''
        Re-add (unarchive) an item to the user's list
        http://getpocket.com/developer/docs/v3/modify#action_readd
        '''

    @bulk_wrapper
    def favorite(self, item_id, time=None, wait=True):
        '''
        Mark an item as a favorite
        http://getpocket.com/developer/docs/v3/modify#action_favorite
        '''

    @bulk_wrapper
    def unfavorite(self, item_id, time=None, wait=True):
        '''
        Remove an item from the user's favorites
        http://getpocket.com/developer/docs/v3/modify#action_unfavorite
        '''

    @bulk_wrapper
    def delete(self, item_id, time=None, wait=True):
        '''
        Permanently remove an item from the user's account
        http://getpocket.com/developer/docs/v3/modify#action_delete
        '''

    @bulk_wrapper
    def tags_add(self, item_id, tags, time=None, wait=True):
        '''
        Add one or more tags to an item
        http://getpocket.com/developer/docs/v3/modify#action_tags_add
        '''

    @bulk_wrapper
    def tags_remove(self, item_id, tags, time=None, wait=True):
        '''
        Remove one or more tags from an item
        http://getpocket.com/developer/docs/v3/modify#action_tags_remove
        '''

    @bulk_wrapper
    def tags_replace(self, item_id, tags, time=None, wait=True):
        '''
        Replace all of the tags for an item with one or more provided tags
        http://getpocket.com/developer/docs/v3/modify#action_tags_replace
        '''

    @bulk_wrapper
    def tags_clear(self, item_id, time=None, wait=True):
        '''
        Remove all tags from an item.
        http://getpocket.com/developer/docs/v3/modify#action_tags_clear
        '''

    @bulk_wrapper
    def tag_rename(self, item_id, old_tag, new_tag, time=None, wait=True):
        '''
        Rename a tag. This affects all items with this tag.
        http://getpocket.com/developer/docs/v3/modify#action_tag_rename
        '''

    def commit(self):
        '''
        This method executes the bulk query, flushes stored queries and
        returns the response
        '''
        url = self.api_endpoints['send']
        payload = {
            'actions': self._bulk_query,
        }
        payload.update(self._payload)

        # The queue is reset before the request is sent, so the queued
        # actions are dropped even if the request raises.
        self._bulk_query = []

        return self._make_request(
            url,
            json.dumps(payload),
            headers={'content-type': 'application/json'},
        )

    @classmethod
    def get_request_token(
        cls, consumer_key, redirect_uri='http://example.com/', state=None
    ):
        '''
        Returns the request token that can be used to fetch the access token
        '''
        headers = {
            'X-Accept': 'application/json',
        }
        url = 'https://getpocket.com/v3/oauth/request'
        payload = {
            'consumer_key': consumer_key,
            'redirect_uri': redirect_uri,
        }

        # `state` is only sent when the caller supplied a truthy value.
        if state:
            payload['state'] = state

        return cls._make_request(url, payload, headers)[0]['code']

    @classmethod
    def get_credentials(cls, consumer_key, code):
        '''
        Fetches the credentials (the decoded response body, which includes
        the access token) using the request token and consumer key
        '''
        headers = {
            'X-Accept': 'application/json',
        }
        url = 'https://getpocket.com/v3/oauth/authorize'
        payload = {
            'consumer_key': consumer_key,
            'code': code,
        }

        return cls._make_request(url, payload, headers)[0]

    @classmethod
    def get_access_token(cls, consumer_key, code):
        '''Return only the 'access_token' field of get_credentials().'''
        return cls.get_credentials(consumer_key, code)['access_token']

    @classmethod
    def get_auth_url(cls, code, redirect_uri='http://example.com'):
        '''
        Build the browser URL the user must open to authorize `code`.
        The values are interpolated as-is, without URL-encoding.
        '''
        auth_url = ('https://getpocket.com/auth/authorize'
                    '?request_token=%s&redirect_uri=%s' % (code, redirect_uri))
        return auth_url

    @classmethod
    def auth(
        cls, consumer_key, redirect_uri='http://example.com/', state=None,
    ):
        '''
        This is a test method for verifying if oauth worked
        http://getpocket.com/developer/docs/authentication
        '''
        code = cls.get_request_token(consumer_key, redirect_uri, state)
        auth_url = cls.get_auth_url(code, redirect_uri)

        # Python 3 renamed raw_input() to input(); raw_input would raise
        # NameError here. Blocks until the user confirms authorization.
        input(
            'Please open %s in your browser to authorize the app and '
            'press enter:' % auth_url
        )

        return cls.get_access_token(consumer_key, code)

View file

@ -48,6 +48,11 @@ setuptools.setup(
"wheel",
],
install_requires=[
# only add things here that have corresponding apt python3-packages available
# anything added here also needs to be added to our package dependencies in
# stdeb.cfg (apt), archivebox.rb (brew), Dockerfile, etc.
# if there is no apt python3-package equivalent, then vendor it instead in
# ./archivebox/vendor/
"requests==2.24.0",
"atomicwrites==1.4.0",
"mypy-extensions==0.4.3",
@ -59,12 +64,6 @@ setuptools.setup(
"python-crontab==2.5.1",
"croniter==0.3.34",
"w3lib==1.22.0",
"pocket==0.3.6",
# Some/all of these will likely be added in the future:
# wpull
# pywb
# pyppeteer
# archivenow
],
extras_require={
'dev': [
@ -81,8 +80,6 @@ setuptools.setup(
"bottle",
"stdeb",
],
# 'redis': ['redis', 'django-redis'],
# 'pywb': ['pywb', 'redis'],
},
packages=[PKG_NAME],
include_package_data=True, # see MANIFEST.in