Add cookies file to HTTP request in download_url

This commit is contained in:
Ben Muthalaly 2024-02-27 15:30:31 -06:00
parent 31d05d8526
commit 68326a60ee

View file

@@ -3,6 +3,7 @@ __package__ = 'archivebox'
import re
import requests
import json as pyjson
import http.cookiejar
from typing import List, Optional, Any
from pathlib import Path
@@ -164,13 +165,26 @@ def parse_date(date: Any) -> Optional[datetime]:
@enforce_types
def download_url(url: str, timeout: int=None) -> str:
"""Download the contents of a remote url and return the text"""
from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT
from .config import (
TIMEOUT,
CHECK_SSL_VALIDITY,
WGET_USER_AGENT,
COOKIES_FILE,
)
timeout = timeout or TIMEOUT
cookie_jar = http.cookiejar.MozillaCookieJar()
if COOKIES_FILE is not None:
cookie_jar.load(COOKIES_FILE, ignore_discard=True, ignore_expires=True)
else:
cookie_jar = None
response = requests.get(
url,
headers={'User-Agent': WGET_USER_AGENT},
verify=CHECK_SSL_VALIDITY,
timeout=timeout,
cookies=cookie_jar,
)
content_type = response.headers.get('Content-Type', '')