ArchiveBox/archivebox/parsers/url_list.py

# Explicitly pin the package this module belongs to; Python resolves the
# relative import further down (``from ..index.schema import Link``) against it.
__package__ = 'archivebox.parsers'
# Short human-readable label for the input format this module handles.
__description__ = 'URL list'

import re

from typing import IO, Iterable
from datetime import datetime, timezone

from ..index.schema import Link
from archivebox.misc.util import (
    enforce_types,
    URL_REGEX,
)


@enforce_types
def parse_url_list(text_file: IO[str], **_kwargs) -> Iterable[Link]:
    """Yield a Link for every line of ``text_file`` that contains a URL.

    The file is rewound to its start and all of its lines are read. Each line
    is stripped of surrounding whitespace; lines that are empty afterwards, or
    in which ``URL_REGEX`` finds no match, are skipped. For every remaining
    line the *whole stripped line* (not only the matched portion) is used as
    the Link's ``url``.

    Each yielded Link has:
      - ``timestamp``: the current UTC time as a stringified epoch float,
        taken at the moment that Link is produced
      - ``title`` and ``tags``: ``None``
      - ``sources``: a one-element list holding ``text_file.name``

    Any extra keyword arguments are accepted and ignored.
    """

    text_file.seek(0)
    raw_lines = text_file.readlines()

    for raw_line in raw_lines:
        candidate = raw_line.strip()

        # Only keep non-blank lines in which the URL pattern matches somewhere.
        if candidate and re.search(URL_REGEX, candidate):
            produced_at = datetime.now(timezone.utc)
            yield Link(
                url=candidate,
                timestamp=str(produced_at.timestamp()),
                title=None,
                tags=None,
                sources=[text_file.name],
            )


# Module-level parser metadata: a machine-friendly key, a display name, and
# the parsing callable itself.
# NOTE(review): presumably read by the parsers package to register this parser
# instead of hardcoding it there — confirm against the package __init__.
KEY = 'url_list'
NAME = 'URL List'
PARSER = parse_url_list
add command: --parser option 2021-03-20 16:38:00 +00:00			`__package__ = 'archivebox.parsers'`
			`__description__ = 'URL list'`

only add url-list lines that are real urls 2021-04-01 07:31:55 +00:00			`import re`

add command: --parser option 2021-03-20 16:38:00 +00:00			`from typing import IO, Iterable`
add timezone support, tons of CSS and layout improvements, more detailed snapshot admin form info, ability to sort by recently updated, better grid view styling, better table layouts, better dark mode support 2021-04-10 08:19:30 +00:00			`from datetime import datetime, timezone`
add command: --parser option 2021-03-20 16:38:00 +00:00
			`from ..index.schema import Link`
move util.py into misc folder 2024-10-01 00:25:15 +00:00			`from archivebox.misc.util import (`
only add url-list lines that are real urls 2021-04-01 07:31:55 +00:00			`enforce_types,`
			`URL_REGEX,`
add command: --parser option 2021-03-20 16:38:00 +00:00			`)`


			`@enforce_types`
			`def parse_url_list(text_file: IO[str], **_kwargs) -> Iterable[Link]:`
			`"""Parse raw URLs from each line in a text file"""`

			`text_file.seek(0)`
			`for line in text_file.readlines():`
			`url = line.strip()`
only add url-list lines that are real urls 2021-04-01 07:31:55 +00:00			`if (not url) or not re.findall(URL_REGEX, url):`
add command: --parser option 2021-03-20 16:38:00 +00:00			`continue`

			`yield Link(`
			`url=url,`
add timezone support, tons of CSS and layout improvements, more detailed snapshot admin form info, ability to sort by recently updated, better grid view styling, better table layouts, better dark mode support 2021-04-10 08:19:30 +00:00			`timestamp=str(datetime.now(timezone.utc).timestamp()),`
add command: --parser option 2021-03-20 16:38:00 +00:00			`title=None,`
			`tags=None,`
			`sources=[text_file.name],`
			`)`
use KEY, NAME, and PARSER to define parsers instead of hardcoding in init 2021-03-31 05:05:49 +00:00

			`KEY = 'url_list'`
			`NAME = 'URL List'`
			`PARSER = parse_url_list`