mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-29 23:50:22 +00:00
Add cookies file to http request in download_url
This commit is contained in:
parent
31d05d8526
commit
68326a60ee
1 changed file with 15 additions and 1 deletion
|
@@ -3,6 +3,7 @@ __package__ = 'archivebox'
|
||||||
import re
|
import re
|
||||||
import requests
|
import requests
|
||||||
import json as pyjson
|
import json as pyjson
|
||||||
|
import http.cookiejar
|
||||||
|
|
||||||
from typing import List, Optional, Any
|
from typing import List, Optional, Any
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@@ -164,13 +165,26 @@ def parse_date(date: Any) -> Optional[datetime]:
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def download_url(url: str, timeout: int=None) -> str:
|
def download_url(url: str, timeout: int=None) -> str:
|
||||||
"""Download the contents of a remote url and return the text"""
|
"""Download the contents of a remote url and return the text"""
|
||||||
from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT
|
from .config import (
|
||||||
|
TIMEOUT,
|
||||||
|
CHECK_SSL_VALIDITY,
|
||||||
|
WGET_USER_AGENT,
|
||||||
|
COOKIES_FILE,
|
||||||
|
)
|
||||||
timeout = timeout or TIMEOUT
|
timeout = timeout or TIMEOUT
|
||||||
|
|
||||||
|
cookie_jar = http.cookiejar.MozillaCookieJar()
|
||||||
|
if COOKIES_FILE is not None:
|
||||||
|
cookie_jar.load(COOKIES_FILE, ignore_discard=True, ignore_expires=True)
|
||||||
|
else:
|
||||||
|
cookie_jar = None
|
||||||
|
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
url,
|
url,
|
||||||
headers={'User-Agent': WGET_USER_AGENT},
|
headers={'User-Agent': WGET_USER_AGENT},
|
||||||
verify=CHECK_SSL_VALIDITY,
|
verify=CHECK_SSL_VALIDITY,
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
|
cookies=cookie_jar,
|
||||||
)
|
)
|
||||||
|
|
||||||
content_type = response.headers.get('Content-Type', '')
|
content_type = response.headers.get('Content-Type', '')
|
||||||
|
|
Loading…
Reference in a new issue