thelounge/server/plugins/irc-events/link.ts

import * as cheerio from "cheerio";
import got from "got";
import {URL} from "url";
import mime from "mime-types";

import log from "../../log";
import Config from "../../config";
import {findLinksWithSchema} from "../../../shared/linkify";
import {LinkPreview} from "../../../shared/types/msg";
import storage from "../storage";
import Client from "../../client";
import Chan from "../../models/chan";
import Msg from "../../models/msg";

/** Outcome of a single prefetch request, as produced by `fetch` below. */
interface FetchRequest {
	/** Bytes that were downloaded before the stream was closed. */
	data: Buffer;
	/** Content-Type header with any `;` parameters stripped, or "" if there was none. */
	type: string;
	/** The larger of the Content-Length header value and the number of bytes downloaded. */
	size: number;
}

// Requests that are currently in flight, keyed by JSON.stringify([uri, headers]),
// so that identical concurrent requests share a single download
const currentFetchPromises: Map<string, Promise<FetchRequest>> = new Map();

// Content types treated as images (thumbnails / inline images)
const imageTypeRegex = /^image\/.+/;

// Content types treated as playable media
const mediaTypeRegex = /^(audio|video)\/.+/;

/**
 * Finds the links in a message, attaches a "loading" preview for each of them
 * to `msg.previews`, and starts fetching them in the background.
 *
 * Does nothing when the `prefetch` config option is off. Each preview is later
 * completed by `parse` once its request finishes, or turned into an "error"
 * preview and emitted to the client if the request (or parsing) fails.
 *
 * @param client - Client that receives the preview events; its configured
 *                 browser language is sent as Accept-Language
 * @param chan - Channel the message belongs to
 * @param msg - Message whose `previews` array is replaced
 * @param cleanText - Message text to search for links
 */
export default function (client: Client, chan: Chan, msg: Msg, cleanText: string) {
	if (!Config.values.prefetch) {
		return;
	}

	const previews: LinkPreview[] = [];

	for (const match of findLinksWithSchema(cleanText)) {
		const fetchUrl = normalizeURL(match.link);

		// Skip, in this order: links that are invalid and cannot be normalized,
		// any further link once five previews have been queued, and links that
		// were already queued for this message
		if (
			!fetchUrl ||
			previews.length > 4 ||
			previews.some((queued) => queued.link === match.link)
		) {
			continue;
		}

		const preview: LinkPreview = {
			type: "loading",
			head: "",
			body: "",
			thumb: "",
			size: -1,
			link: match.link, // Send original matched link to the client
			shown: null,
		};

		previews.push(preview);

		const requestHeaders = {
			accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
			language: client.config.browser?.language || "",
		};

		fetch(fetchUrl, requestHeaders)
			.then((res) => {
				parse(msg, chan, preview, res, client);
			})
			.catch((err) => {
				// Tell the client why this preview could not be loaded
				preview.type = "error";
				preview.error = "message";
				preview.message = err.message;
				emitPreview(client, chan, msg, preview);
			});
	}

	msg.previews = previews;
}

/**
 * Builds a preview for an HTML page.
 *
 * Open Graph audio/video is tried first through `parseHtmlMedia`. If that
 * rejects, the preview becomes a plain "link" preview made of the page title
 * (max 100 chars), description (max 300 chars) and, unless thumbnails are
 * disabled by configuration, a thumbnail that is fetched to verify it is an
 * image within the allowed size.
 *
 * @param preview - Preview object, mutated in place
 * @param res - Fetched page; `res.data` is parsed as HTML
 * @param client - Provides the Accept-Language value for follow-up requests
 * @returns Promise resolving with the response that should be handed to
 *          `handlePreview`: the media response, the thumbnail response, the
 *          original page response, or `null` if the thumbnail request failed
 */
function parseHtml(preview, res, client: Client) {
	// TODO:
	// eslint-disable-next-line @typescript-eslint/no-misused-promises
	return new Promise((resolve: (preview: FetchRequest | null) => void) => {
		const $ = cheerio.load(res.data);
		const metaContent = (selector: string) => $(selector).attr("content");

		// Used when the page has no usable Open Graph media
		const describeAsLink = () => {
			preview.type = "link";
			preview.head = (
				metaContent('meta[property="og:title"]') ||
				$("head > title, title").first().text() ||
				""
			).slice(0, 100);
			preview.body = (
				metaContent('meta[property="og:description"]') ||
				metaContent('meta[name="description"]') ||
				""
			).slice(0, 300);

			const thumbnailsDisabled =
				!Config.values.prefetchStorage && Config.values.disableMediaPreview;

			if (thumbnailsDisabled) {
				resolve(res);
				return;
			}

			const candidate =
				metaContent('meta[property="og:image"]') ||
				metaContent('meta[name="twitter:image:src"]') ||
				$('link[rel="image_src"]').attr("href") ||
				"";

			// Make sure thumbnail is a valid and absolute url
			const thumb = candidate.length ? normalizeURL(candidate, preview.link) || "" : "";

			if (!thumb.length) {
				resolve(res);
				return;
			}

			// Verify that thumbnail pic exists and is under allowed size
			fetch(thumb, {language: client.config.browser?.language || ""})
				.then((resThumb) => {
					const maxBytes = Config.values.prefetchMaxImageSize * 1024;
					const usable =
						resThumb !== null &&
						imageTypeRegex.test(resThumb.type) &&
						resThumb.size <= maxBytes;

					if (usable) {
						preview.thumbActualUrl = thumb;
					}

					resolve(resThumb);
				})
				.catch(() => resolve(null));
		};

		return parseHtmlMedia($, preview, client).then(resolve).catch(describeAsLink);
	});
}

/**
 * Looks for an Open Graph video or audio entry in a parsed page and, when one
 * is found and verified, turns `preview` into a media preview.
 *
 * Only the first `og:<type>:type` entry with an audio/video MIME type and a
 * usable https URL is tried ("video" entries are checked before "audio").
 *
 * @param $ - cheerio instance loaded with the page
 * @param preview - Preview object; on success its `type`, `media` and
 *                  `mediaType` are set
 * @param client - Provides the Accept-Language value for the media request
 * @returns Promise resolving with the fetched media response. It rejects
 *          without a reason when media previews are disabled, when `og:type`
 *          is present but is neither video nor music, when no usable entry is
 *          found, or when the fetched response is not audio/video; it rejects
 *          with the request error if fetching the media fails.
 */
// TODO: type $
function parseHtmlMedia($: any, preview, client: Client): Promise<FetchRequest> {
	return new Promise((resolve, reject) => {
		if (Config.values.disableMediaPreview) {
			reject();
			return;
		}

		let foundMedia = false;
		const openGraphType = $('meta[property="og:type"]').attr("content");

		// Certain news websites may include video and audio tags,
		// despite actually being an article (as indicated by og:type).
		// If there is og:type tag, we will only select video or audio if it matches
		if (
			openGraphType &&
			!openGraphType.startsWith("video") &&
			!openGraphType.startsWith("music")
		) {
			reject();
			return;
		}

		["video", "audio"].forEach((type) => {
			if (foundMedia) {
				return;
			}

			$(`meta[property="og:${type}:type"]`).each(function (this: cheerio.Element, i: number) {
				const mimeType = $(this).attr("content");

				if (!mimeType) {
					return;
				}

				if (mediaTypeRegex.test(mimeType)) {
					// If we match a clean video or audio tag, parse that as a preview instead.
					// The i-th og:<type>:type tag is paired with the i-th og:<type> tag.
					let mediaUrl = $($(`meta[property="og:${type}"]`).get(i)).attr("content");

					if (!mediaUrl) {
						return;
					}

					// Resolve against the page url; plain http media is not allowed
					mediaUrl = normalizeURL(mediaUrl, preview.link, true);

					// Make sure media is a valid url
					if (!mediaUrl) {
						return;
					}

					foundMedia = true;

					fetch(mediaUrl, {
						// Media ranges in an Accept header are separated by commas;
						// a semicolon only introduces parameters of the preceding range
						accept:
							type === "video"
								? "video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5"
								: "audio/webm, audio/ogg, audio/wav, audio/*;q=0.9, application/ogg;q=0.7, video/*;q=0.6, */*;q=0.5",
						language: client.config.browser?.language || "",
					})
						.then((resMedia) => {
							// The server must actually answer with audio/video content
							if (resMedia === null || !mediaTypeRegex.test(resMedia.type)) {
								return reject();
							}

							preview.type = type;
							preview.media = mediaUrl;
							preview.mediaType = resMedia.type;

							resolve(resMedia);
						})
						.catch(reject);

					// Returning false stops the .each() iteration: only one candidate is tried
					return false;
				}
			});
		});

		if (!foundMedia) {
			reject();
		}
	});
}

// Image content types that are shown as an inline image preview
const previewableImageTypes = new Set([
	"image/png",
	"image/gif",
	"image/jpg",
	"image/jpeg",
	"image/jxl",
	"image/webp",
	"image/avif",
]);

// Audio content types that are shown as an audio player
const previewableAudioTypes = new Set([
	"audio/midi",
	"audio/mpeg",
	"audio/mpeg3",
	"audio/ogg",
	"audio/wav",
	"audio/x-wav",
	"audio/x-mid",
	"audio/x-midi",
	"audio/x-mpeg",
	"audio/x-mpeg-3",
	"audio/flac",
	"audio/x-flac",
	"audio/mp4",
	"audio/x-m4a",
]);

// Video content types that are shown as a video player
const previewableVideoTypes = new Set(["video/webm", "video/ogg", "video/mp4"]);

/**
 * Turns a fetched response into a preview, based on its content type.
 *
 * HTML pages are handed to `parseHtml` and finished asynchronously; plain
 * text, images, audio and video are filled in directly. Any other content
 * type removes the preview from the message. Unless the preview was removed,
 * it is passed on to `handlePreview`.
 *
 * @param msg - Message that owns the preview
 * @param chan - Channel the message belongs to
 * @param preview - Preview object, mutated in place
 * @param res - Response fetched for the link
 * @param client - Client that receives the preview event
 */
function parse(msg: Msg, chan: Chan, preview: LinkPreview, res: FetchRequest, client: Client) {
	const contentType = res.type;

	preview.size = res.size;

	if (contentType === "text/html") {
		preview.size = -1;

		void parseHtml(preview, res, client).then((newRes) =>
			handlePreview(client, chan, msg, preview, newRes)
		);

		return;
	}

	if (contentType === "text/plain") {
		preview.type = "link";
		preview.body = res.data.toString().substr(0, 300);
	} else if (previewableImageTypes.has(contentType)) {
		if (!Config.values.prefetchStorage && Config.values.disableMediaPreview) {
			return removePreview(msg, preview);
		}

		const maxBytes = Config.values.prefetchMaxImageSize * 1024;

		if (res.size > maxBytes) {
			preview.type = "error";
			preview.error = "image-too-big";
			preview.maxSize = maxBytes;
		} else {
			preview.type = "image";
			preview.thumbActualUrl = preview.link;
		}
	} else if (previewableAudioTypes.has(contentType) || previewableVideoTypes.has(contentType)) {
		const mediaKind = previewableAudioTypes.has(contentType) ? "audio" : "video";

		// Media is only embedded for https links; other links leave the preview untouched
		if (preview.link.startsWith("https://")) {
			if (Config.values.disableMediaPreview) {
				return removePreview(msg, preview);
			}

			preview.type = mediaKind;
			preview.media = preview.link;
			preview.mediaType = contentType;
		}
	} else {
		return removePreview(msg, preview);
	}

	return handlePreview(client, chan, msg, preview, res);
}

/**
 * Finalizes the thumbnail of a preview and emits the preview to the client.
 *
 * Without a thumbnail, or when `prefetchStorage` is off, the preview is
 * emitted right away with the thumbnail url as-is. Otherwise the downloaded
 * data is stored and the preview is emitted with the stored file's uri.
 *
 * @param client - Client that receives the preview event
 * @param chan - Channel the message belongs to
 * @param msg - Message that owns the preview
 * @param preview - Preview object; `thumbActualUrl` is consumed and removed
 * @param res - Response whose `type` and `data` describe the thumbnail; only
 *              read when the thumbnail is going to be stored
 */
function handlePreview(client: Client, chan: Chan, msg: Msg, preview: LinkPreview, res) {
	const thumbUrl = preview.thumbActualUrl || "";
	delete preview.thumbActualUrl;

	const storeLocally = thumbUrl.length > 0 && Config.values.prefetchStorage;

	if (!storeLocally) {
		preview.thumb = thumbUrl;
		return emitPreview(client, chan, msg, preview);
	}

	// Get the correct file extension for the provided content-type
	// This is done to prevent user-input being stored in the file name (extension)
	const extension = mime.extension(res.type);

	if (extension) {
		storage.store(res.data, extension, (uri) => {
			preview.thumb = uri;

			emitPreview(client, chan, msg, preview);
		});

		return;
	}

	// Unknown extension: link previews are shown without a thumbnail,
	// every other preview type is not displayed at all
	if (preview.type === "link") {
		return emitPreview(client, chan, msg, preview);
	}

	return removePreview(msg, preview);
}

/**
 * Sends a finished preview to the client as a "msg:preview" event.
 *
 * A "link" preview without a title is given the title "Untitled page" when it
 * has a thumbnail or a description; if it has neither, the preview is removed
 * from the message and nothing is emitted.
 *
 * @param client - Client to emit the event on
 * @param chan - Channel the message belongs to
 * @param msg - Message that owns the preview
 * @param preview - Preview to send
 */
function emitPreview(client: Client, chan: Chan, msg: Msg, preview: LinkPreview) {
	const isUntitledLink = !preview.head.length && preview.type === "link";

	if (isUntitledLink) {
		const hasContent = preview.thumb.length > 0 || preview.body.length > 0;

		// Nothing at all to show: bail out instead of emitting an empty preview
		if (!hasContent) {
			return removePreview(msg, preview);
		}

		preview.head = "Untitled page";
	}

	client.emit("msg:preview", {
		id: msg.id,
		chan: chan.id,
		preview,
	});
}

/**
 * Removes a preview from its message, so that the client does not attempt to
 * display a preview that failed to load when the page is reloaded.
 *
 * Does nothing if the preview is not part of `msg.previews`.
 *
 * @param msg - Message that owns the preview
 * @param preview - Preview to remove (matched by identity)
 */
function removePreview(msg: Msg, preview: LinkPreview) {
	const position = msg.previews.indexOf(preview);

	if (position === -1) {
		return;
	}

	msg.previews.splice(position, 1);
}

/**
 * Builds the HTTP request headers used for preview requests.
 *
 * @param headers - Options for the request: `accept` becomes the Accept header
 *                  (falls back to any type) and a non-empty `language` becomes
 *                  the Accept-Language header
 * @returns Header map containing User-Agent, Accept, X-Purpose and, only when
 *          a language was given, Accept-Language
 */
function getRequestHeaders(headers: Record<string, string>) {
	const {accept, language} = headers;

	return {
		// Certain websites like Amazon only add <meta> tags to known bots,
		// lets pretend to be them to get the metadata
		"User-Agent":
			"Mozilla/5.0 (compatible; The Lounge IRC Client; +https://github.com/thelounge/thelounge)" +
			" facebookexternalhit/1.1 Twitterbot/1.0",
		Accept: accept || "*/*",
		"X-Purpose": "preview",
		...(language ? {"Accept-Language": language} : {}),
	};
}

/**
 * Downloads `uri` (or only as much of it as is needed) and resolves with the
 * received bytes, the content type and the size.
 *
 * Concurrent calls with the same uri and headers share one in-flight promise;
 * the cache entry is removed as soon as that promise settles.
 *
 * @param uri - Address to request
 * @param headers - Request options passed to `getRequestHeaders`
 *                  (its `accept` and `language` keys are read there)
 * @returns Promise that resolves when the stream is closed, and rejects on a
 *          stream "error" event or if creating the stream throws
 */
function fetch(uri: string, headers: Record<string, string>) {
	// Stringify the object otherwise the objects won't compute to the same value
	const cacheKey = JSON.stringify([uri, headers]);
	let promise = currentFetchPromises.get(cacheKey);

	// An identical request is already running, reuse it
	if (promise) {
		return promise;
	}

	const prefetchTimeout = Config.values.prefetchTimeout;

	if (!prefetchTimeout) {
		log.warn(
			"prefetchTimeout is missing from your The Lounge configuration, defaulting to 5000 ms"
		);
	}

	promise = new Promise<FetchRequest>((resolve, reject) => {
		let buffer = Buffer.from("");
		let contentLength = 0;
		let contentType: string | undefined;
		// Maximum number of bytes to download; starts as the image limit and is
		// replaced by the search size limit for responses that are neither image nor media
		let limit = Config.values.prefetchMaxImageSize * 1024;

		try {
			const gotStream = got.stream(uri, {
				retry: 0,
				timeout: prefetchTimeout || 5000, // milliseconds
				headers: getRequestHeaders(headers),
				localAddress: Config.values.bind,
			});

			gotStream
				.on("response", function (res) {
					// A missing or unparsable Content-Length counts as 0
					contentLength = parseInt(res.headers["content-length"], 10) || 0;
					contentType = res.headers["content-type"];

					if (contentType && imageTypeRegex.test(contentType)) {
						// response is an image
						// if Content-Length header reports a size exceeding the prefetch limit, abort fetch
						// and if file is not to be stored we don't need to download further either
						if (contentLength > limit || !Config.values.prefetchStorage) {
							gotStream.destroy();
						}
					} else if (contentType && mediaTypeRegex.test(contentType)) {
						// We don't need to download the file any further after we received content-type header
						gotStream.destroy();
					} else {
						// if not image, limit download to the max search size, since we need only meta tags
						// twitter.com sends opengraph meta tags within ~20kb of data for individual tweets, the default is set to 50.
						// for sites like Youtube the og tags are in the first 300K and hence this is configurable by the admin
						limit =
							"prefetchMaxSearchSize" in Config.values
								? Config.values.prefetchMaxSearchSize * 1024
								: // set to the previous size if config option is unset
								  50 * 1024;
					}
				})
				// If "close" also fires after an error, the resolve() below has no
				// effect because the promise is already settled
				.on("error", (e) => reject(e))
				.on("data", (data) => {
					// Append the received chunk to what was downloaded so far
					buffer = Buffer.concat(
						[buffer, data],
						buffer.length + (data as Array<any>).length
					);

					// Stop downloading once the limit is reached
					if (buffer.length >= limit) {
						gotStream.destroy();
					}
				})
				// NOTE(review): presumably ensures the "close" handler below also runs
				// after a regular end of stream - confirm against got's stream behaviour
				.on("end", () => gotStream.destroy())
				.on("close", () => {
					let type = "";

					// If we downloaded more data than specified in Content-Length, use real data size
					const size = contentLength > buffer.length ? contentLength : buffer.length;

					// Keep only the media type, dropping parameters such as charset
					if (contentType) {
						type = contentType.split(/ *; */).shift() || "";
					}

					resolve({data: buffer, type, size});
				});
		} catch (e: any) {
			return reject(e);
		}
	});

	// Forget the request once it has settled, whether it succeeded or failed
	const removeCache = () => currentFetchPromises.delete(cacheKey);

	promise.then(removeCache).catch(removeCache);

	currentFetchPromises.set(cacheKey, promise);

	return promise;
}

/**
 * Validates a link and converts it into an absolute url that is safe to fetch.
 *
 * @param link - Absolute url, or a url relative to `baseLink`
 * @param baseLink - Base used to resolve `link` when it is relative
 * @param disallowHttp - When true, only https links are accepted
 * @returns The url as a string with its hash removed, or `undefined` when the
 *          link cannot be parsed, is not http(s), uses http while it is
 *          disallowed, has no hostname, or contains a username or password
 */
function normalizeURL(link: string, baseLink?: string, disallowHttp = false) {
	let parsed: URL;

	try {
		parsed = new URL(link, baseLink);
	} catch {
		// if an exception was thrown, the url is not valid
		return undefined;
	}

	const isHttp = parsed.protocol === "http:";
	const isHttps = parsed.protocol === "https:";

	// Only fetch http and https links
	if (!isHttp && !isHttps) {
		return undefined;
	}

	if (isHttp && disallowHttp) {
		return undefined;
	}

	// Do not fetch links without hostname or ones that contain authorization
	const hasCredentials = Boolean(parsed.username || parsed.password);

	if (!parsed.hostname || hasCredentials) {
		return undefined;
	}

	// Drop hash from the url, if any
	parsed.hash = "";

	return parsed.toString();
}