Add cookies file to HTTP request in download_url

This commit is contained in:
Ben Muthalaly 2024-02-27 15:30:31 -06:00
parent 31d05d8526
commit 68326a60ee

View file

@@ -3,6 +3,7 @@ __package__ = 'archivebox'
import re import re
import requests import requests
import json as pyjson import json as pyjson
import http.cookiejar
from typing import List, Optional, Any from typing import List, Optional, Any
from pathlib import Path from pathlib import Path
@@ -164,13 +165,26 @@ def parse_date(date: Any) -> Optional[datetime]:
@enforce_types @enforce_types
def download_url(url: str, timeout: int=None) -> str: def download_url(url: str, timeout: int=None) -> str:
"""Download the contents of a remote url and return the text""" """Download the contents of a remote url and return the text"""
from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT from .config import (
TIMEOUT,
CHECK_SSL_VALIDITY,
WGET_USER_AGENT,
COOKIES_FILE,
)
timeout = timeout or TIMEOUT timeout = timeout or TIMEOUT
cookie_jar = http.cookiejar.MozillaCookieJar()
if COOKIES_FILE is not None:
cookie_jar.load(COOKIES_FILE, ignore_discard=True, ignore_expires=True)
else:
cookie_jar = None
response = requests.get( response = requests.get(
url, url,
headers={'User-Agent': WGET_USER_AGENT}, headers={'User-Agent': WGET_USER_AGENT},
verify=CHECK_SSL_VALIDITY, verify=CHECK_SSL_VALIDITY,
timeout=timeout, timeout=timeout,
cookies=cookie_jar,
) )
content_type = response.headers.get('Content-Type', '') content_type = response.headers.get('Content-Type', '')