Plex-Meta-Manager/modules/letterboxd.py

131 lines
7.1 KiB
Python
Raw Normal View History

2022-03-31 06:23:48 +00:00
import re, time
2021-03-04 20:05:51 +00:00
from modules import util
from modules.util import Failed
# Project-wide logger shared through the util module.
logger = util.logger

# Method names this module answers to (presumably consumed by the collection
# builder dispatch elsewhere in the project -- confirm against callers).
builders = ["letterboxd_list", "letterboxd_list_details"]
# Site root: list URLs are validated against it, and relative links scraped
# from pages (next-page hrefs, film slugs) are appended to it.
base_url = "https://letterboxd.com"
2021-03-30 05:50:53 +00:00
2021-06-14 15:24:11 +00:00
class Letterboxd:
    """Scrape Letterboxd list pages and resolve their films to TMDb IDs.

    Both collaborators are injected by the caller:

    * ``requests`` must expose ``get_html(url, language=...)``; the object it
      returns is only used through ``.xpath()`` (presumably an lxml tree).
    * ``cache`` may be falsy to disable caching; otherwise it must expose
      ``query_letterboxd_map`` and ``update_letterboxd_map``.
    """

    def __init__(self, requests, cache):
        self.requests = requests
        self.cache = cache

    def _parse_page(self, list_url, language):
        """Fetch one list page and extract its film entries.

        Returns a ``(items, next_url)`` pair. ``items`` is a list of
        ``(letterboxd_id, slug, year, comment, rating)`` tuples where ``year``,
        ``comment`` and ``rating`` are ``None`` when the page does not carry
        them. ``next_url`` is the (possibly empty) list of hrefs found on the
        "next" pagination link.
        """
        # Browse-style "/films" URLs are rewritten to their "/films/ajax"
        # counterpart unless the URL already mentions "ajax".
        if "ajax" not in list_url:
            list_url = list_url.replace("https://letterboxd.com/films", "https://letterboxd.com/films/ajax")
        logger.trace(f"URL: {list_url}")
        response = self.requests.get_html(list_url, language=language)
        letterboxd_ids = response.xpath("//li[contains(@class, 'poster-container') or contains(@class, 'film-detail')]/div/@data-film-id")
        items = []
        for letterboxd_id in letterboxd_ids:
            # Every per-film lookup hangs off the poster <div> for this ID;
            # the optional details live in a sibling block under the same <li>.
            poster = f"//div[@data-film-id='{letterboxd_id}']"
            detail = f"{poster}/parent::li/div[@class='film-detail-content']"
            slugs = response.xpath(f"{poster}/@data-target-link")
            comments = response.xpath(f"{detail}/div/p/text()")
            ratings = response.xpath(f"{detail}//span[contains(@class, 'rating')]/@class")
            years = response.xpath(f"{detail}/h2/small/a/text()")
            rating = None
            if ratings:
                # The score is encoded in the CSS class, e.g. "rated-8".
                match = re.search(r"rated-(\d+)", ratings[0])
                if match:
                    rating = int(match.group(1))
            items.append((
                letterboxd_id,
                slugs[0],
                int(years[0]) if years else None,
                comments[0] if comments else None,
                rating,
            ))
        next_url = response.xpath("//a[@class='next']/@href")
        return items, next_url

    def _parse_list(self, list_url, limit, language):
        """Collect the entries of a list, following pagination.

        ``limit`` caps the number of returned entries; a falsy value (``0`` or
        ``None``) means "no cap". The cap is honoured on every path, including
        single-page lists, and no further page is requested once enough
        entries have been gathered.
        """
        items, next_url = self._parse_page(list_url, language)
        while next_url and not (limit and len(items) >= limit):
            # Pause between page requests.
            time.sleep(2)
            new_items, next_url = self._parse_page(f"{base_url}{next_url[0]}", language)
            items.extend(new_items)
        return items[:limit] if limit else items

    def _tmdb(self, letterboxd_url, language):
        """Return the TMDb movie ID linked from a Letterboxd film page.

        Raises ``Failed`` when the page has no TMDb link or the link does not
        point at a TMDb movie.
        """
        logger.trace(f"URL: {letterboxd_url}")
        response = self.requests.get_html(letterboxd_url, language=language)
        ids = response.xpath("//a[@data-track-action='TMDb']/@href")
        if len(ids) > 0 and ids[0]:
            if "themoviedb.org/movie" in ids[0]:
                return util.regex_first_int(ids[0], "TMDb Movie ID")
            raise Failed(f"Letterboxd Error: TMDb Movie ID not found in {ids[0]}")
        raise Failed(f"Letterboxd Error: TMDb Movie ID not found at {letterboxd_url}")

    def get_list_description(self, list_url, language):
        """Return the list's description, or ``None`` when none is found.

        The description is the text following "About this list: " in the
        page's ``og:description`` meta tag.
        """
        logger.trace(f"URL: {list_url}")
        response = self.requests.get_html(list_url, language=language)
        descriptions = response.xpath("//meta[@property='og:description']/@content")
        if len(descriptions) > 0 and len(descriptions[0]) > 0 and "About this list: " in descriptions[0]:
            return str(descriptions[0]).split("About this list: ")[1]
        return None

    def validate_letterboxd_lists(self, err_type, letterboxd_lists, language):
        """Normalise and validate user-supplied list definitions.

        Each entry is either a dict of options or a bare value that is treated
        as the ``url``. Returns a list of dicts with the keys ``url``,
        ``limit``, ``note``, ``rating`` and ``year``.

        Raises ``Failed`` when a URL does not start with ``base_url`` or when
        its first page yields no entries. Note that validation fetches the
        first page of every list.
        """
        valid_lists = []
        for letterboxd_dict in util.get_list(letterboxd_lists, split=False):
            if not isinstance(letterboxd_dict, dict):
                letterboxd_dict = {"url": letterboxd_dict}
            # Lower-cased key -> key as the user wrote it.
            dict_methods = {dm.lower(): dm for dm in letterboxd_dict}
            final = {
                "url": util.parse(err_type, "url", letterboxd_dict, methods=dict_methods, parent="letterboxd_list").strip(),
                "limit": util.parse(err_type, "limit", letterboxd_dict, methods=dict_methods, datatype="int", parent="letterboxd_list", default=0) if "limit" in dict_methods else 0,
                "note": util.parse(err_type, "note", letterboxd_dict, methods=dict_methods, parent="letterboxd_list") if "note" in dict_methods else None,
                "rating": util.parse(err_type, "rating", letterboxd_dict, methods=dict_methods, datatype="int", parent="letterboxd_list", maximum=100, range_split="-") if "rating" in dict_methods else None,
                "year": util.parse(err_type, "year", letterboxd_dict, methods=dict_methods, datatype="int", parent="letterboxd_list", minimum=1000, maximum=3000, range_split="-") if "year" in dict_methods else None
            }
            if not final["url"].startswith(base_url):
                raise Failed(f"{err_type} Error: {final['url']} must begin with: {base_url}")
            elif not self._parse_page(final["url"], language)[0]:
                raise Failed(f"{err_type} Error: {final['url']} failed to parse")
            valid_lists.append(final)
        return valid_lists

    @staticmethod
    def _outside_range(value, bounds):
        """Return True when ``value`` is missing or outside ``"start-end"``."""
        start, end = bounds.split("-")
        return not value or int(end) < value or value < int(start)

    def get_tmdb_ids(self, method, data, language):
        """Resolve a validated list definition to ``(tmdb_id, "tmdb")`` pairs.

        ``data`` is one of the dicts produced by ``validate_letterboxd_lists``.
        Entries that fail the optional ``year`` / ``rating`` / ``note`` filters
        are skipped and reported in a single log line; entries whose TMDb ID
        cannot be found are logged as errors and skipped.

        Raises ``Failed`` for an unsupported ``method`` or when the list has
        no entries.
        """
        if method != "letterboxd_list":
            raise Failed(f"Letterboxd Error: Method {method} not supported")
        logger.info(f"Processing Letterboxd List: {data}")
        items = self._parse_list(data["url"], data["limit"], language)
        total_items = len(items)
        if total_items == 0:
            raise Failed(f"Letterboxd Error: No List Items found in {data}")

        ids = []
        filtered_ids = []
        for i, item in enumerate(items, 1):
            letterboxd_id, slug, year, note, rating = item
            filtered = False
            if data["year"] and self._outside_range(year, data["year"]):
                filtered = True
            if data["rating"] and self._outside_range(rating, data["rating"]):
                filtered = True
            if data["note"] and (not note or data["note"] not in note):
                filtered = True
            if filtered:
                filtered_ids.append(slug)
                continue
            logger.ghost(f"Finding TMDb ID {i}/{total_items}")
            tmdb_id = None
            expired = None
            if self.cache:
                tmdb_id, expired = self.cache.query_letterboxd_map(letterboxd_id)
            # Scrape the film page unless the cache returned an ID together
            # with an explicit expired=False.
            if not tmdb_id or expired is not False:
                try:
                    tmdb_id = self._tmdb(f"{base_url}{slug}", language)
                except Failed as e:
                    logger.error(e)
                    continue
                if self.cache:
                    self.cache.update_letterboxd_map(expired, letterboxd_id, tmdb_id)
            ids.append((tmdb_id, "tmdb"))
        logger.info(f"Processed {total_items} TMDb IDs")
        if filtered_ids:
            logger.info(f"Filtered: {filtered_ids}")
        return ids