diff --git a/Dockerfile b/Dockerfile
index 4c4c0ec..fc22687 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,7 +4,8 @@ WORKDIR /app
 
 COPY requirements.txt /tmp/requirements.txt
 RUN pip install -r /tmp/requirements.txt
+RUN rm /tmp/requirements.txt
 
-COPY server.py /app/server.py
+COPY ./src/ /app/
 
 CMD ["python", "server.py"]
diff --git a/README.md b/README.md
index b1ecbb6..95a4412 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,23 @@
 # Drop In URL previews server
 
+*aka DIP / Drop-In Previews*
+
 A simple python server that handles /_matrix/media/*/preview_url requests, for servers like Dendrite.
 
+You may also want to replace your homeserver's built-in URL preview generator with this one, in case it offers more features.
+
+## Features
+
+DIP ships with the following features:
+
+* **Full** [`OG/OpenGraph`](https://ogp.me/) tag support
+* Partial [`Twitter`](https://developer.twitter.com/en/docs/twitter-for-websites/cards/overview/markup) card support
+* Preview rendering for plain image files
+* Request proxying through an HTTP/HTTPS/SOCKS4/SOCKS5 proxy
+* A custom user agent for outgoing requests
+* Preview caching to prevent repeated lookups
+* Built-in media de-duplication
+
 ## Installation
 
 Just use docker.
@@ -20,7 +36,7 @@ services:
       - "PREVIEW_HOMESERVER=https://matrix.nexy7574.co.uk"
       - "FORWARDED_ALLOW_IPS=*"
     ports:
-      - "2226:2226"
+      - "2226:2226/tcp"
     restart: "unless-stopped"
     container_name: "dendrite-url-previews"
 volumes:
@@ -33,10 +49,15 @@ volumes:
 
 ## Configuration
 
-| Environment Variable  | Description                                                                                        | Example                         | Default                           |
-|-----------------------|----------------------------------------------------------------------------------------------------|---------------------------------|-----------------------------------|
-| `PREVIEW_HOMESERVER`  | The homeserver to use for the previews.                                                            | `https://matrix.nexy7574.co.uk` | The host name of the request URL. |
-| `PREVIEW_HOST`        | The host IP/Name to listen to.                                                                     | `192.168.0.2`                   | `0.0.0.0`                         |
-| `PREVIEW_PORT`        | The port to listen to.                                                                             | `8080`                          | `2226`                            |
-| `PREVIEW_PROXY`       | A HTTP/HTTPS/SOCKS4/SOCKS5 proxy to use for all network requests.
-| `FORWARDED_ALLOW_IPS` | The list of reverse proxy IPs to trust. See [Uvicorn docs](https://www.uvicorn.org/settings/#http) | *                               | `127.0.0.1`                       |
+| Environment Variable   | Description | Example | Default |
+|------------------------|-------------|---------|---------|
+| `PREVIEW_HOMESERVER`   | The homeserver to use for the previews. | `https://matrix.nexy7574.co.uk` | The host name of the request URL. |
+| `PREVIEW_HOST`         | The host IP/name to listen on. | `192.168.0.2` | `0.0.0.0` |
+| `PREVIEW_PORT`         | The port to listen on. | `8080` | `2226` |
+| `PREVIEW_PROXY`        | An HTTP/HTTPS/SOCKS4/SOCKS5 proxy to use for all network requests. | `http://localhost:1080` | null |
+| `PREVIEW_USER_AGENT`   | The user agent to use for all network requests. Must be one of `google`, `bing`, `yahoo`, `duckduckgo`, `firefox`, `chrome`, `twitter`, `facebook`, or `honest` (a unique, honest user agent). | `firefox` | `google` |
+| `PREVIEW_MAX_MEDIA_MB` | The maximum size of downloaded media, in megabytes. Media larger than this will not be re-uploaded to the homeserver's media repo, and as such cannot be used in the preview response. | `10` | `50` |
+| `PREVIEW_DATABASE_URL` | The `sqlite://`, `postgres://`, or `mysql://` URL to use for the database. | `postgres://user:pass@localhost:5432/dip` | `sqlite:///data/db.sqlite3` |
+| `FORWARDED_ALLOW_IPS`  | The list of reverse proxy IPs to trust. See the [Uvicorn docs](https://www.uvicorn.org/settings/#http). | `*` | `127.0.0.1` |
+| `LOG_LEVEL`            | The log level to use. One of `DEBUG`, `INFO`, `WARNING`, `ERROR`, `CRITICAL`. | `INFO` | `INFO` |
+| `LOG_DEBUG_TIDY`       | When `LOG_LEVEL` is `DEBUG`, silences some really noisy loggers (like HTTP request loggers) so you can debug this program rather than its dependencies. | `true` | `false` |
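+
+## Example request
+
+A minimal sketch of querying the `/preview_url` route directly with [`httpx`](https://www.python-httpx.org) (this assumes the compose setup above, and `TOKEN` stands in for a real access token on your homeserver):
+
+```python
+import httpx
+
+response = httpx.get(
+    "http://localhost:2226/preview_url",
+    params={"url": "https://matrix.org", "access_token": "TOKEN"},
+)
+response.raise_for_status()
+# X-Cache is one of: optimal, stale, stale-miss, full-miss
+print(response.headers.get("X-Cache"))
+print(response.json())  # the OpenGraph tags, e.g. {"og:title": ...}
+```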
diff --git a/requirements.txt b/requirements.txt
index 42b5b14..1033700 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ fastapi~=0.109
 httpx[socks]~=0.26
 beautifulsoup4~=4.12
 appdirs~=1.4
+peewee~=3.17
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/db.py b/src/db.py
new file mode 100644
index 0000000..d0155be
--- /dev/null
+++ b/src/db.py
@@ -0,0 +1,60 @@
+import uuid as _uuid
+import os
+import typing
+from peewee import *
+from playhouse.db_url import connect as db_url_connect
+from pathlib import Path
+from appdirs import user_cache_dir
+
+
+def get_db():
+    # Match the variable name documented in the README: PREVIEW_DATABASE_URL.
+    if url := os.getenv("PREVIEW_DATABASE_URL"):
+        if not url.startswith(("sqlite", "postgres", "mysql")):
+            raise ValueError("Unknown database URL. Must be sqlite, postgres or mysql.")
+        # playhouse.db_url parses sqlite://, postgres:// and mysql:// URLs into
+        # the right Database class, rather than passing the raw URL through.
+        return db_url_connect(url)
+    else:
+        if Path.cwd() == Path("/app"):
+            # Running in the docker container; keep the database on the /data volume.
+            _dir = Path("/data")
+        else:
+            _dir = Path(user_cache_dir("matrix-url-preview"))
+        file = _dir / "db.sqlite3"
+        file.parent.mkdir(parents=True, exist_ok=True)
+        file.touch(exist_ok=True)
+        return SqliteDatabase(file)
+
+
+db = get_db()
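+
+# Cache schema, in brief: CachedURLs holds one row per preview fetch (the URL,
+# the unix timestamp it was fetched at, and the OpenGraph metadata as a JSON
+# string); CachedMedia maps the MD5 of downloaded media to the mxc:// URL it
+# was uploaded as, which is what prevents duplicate uploads.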
+
+
+class CachedURLs(Model):
+    uuid = UUIDField(primary_key=True, default=_uuid.uuid4)
+    url = TextField(null=False)
+    ts = FloatField(null=False)
+    metadata = TextField(null=False)
+
+    class Meta:
+        database = db
+
+    if typing.TYPE_CHECKING:
+        uuid: _uuid.UUID
+        url: str
+        ts: float
+        metadata: str
+
+
+class CachedMedia(Model):
+    uuid = UUIDField(primary_key=True, default=_uuid.uuid4)
+    mxc_url = TextField(null=False)
+    md5 = TextField(null=False, index=True)
+
+    class Meta:
+        database = db
+
+    if typing.TYPE_CHECKING:
+        uuid: _uuid.UUID
+        mxc_url: str
+        md5: str
diff --git a/server.py b/src/server.py
similarity index 69%
rename from server.py
rename to src/server.py
index f27ed09..463b786 100644
--- a/server.py
+++ b/src/server.py
@@ -1,5 +1,6 @@
 import contextlib
 import datetime
+import fnmatch
 import io
 import json
 import os
@@ -10,8 +11,7 @@
 import uuid
 import fastapi
 import httpx
 import logging
-import sqlite3
-import appdirs
+import hashlib
 from threading import Lock
 from typing import Annotated
 from fastapi import Query, Header, HTTPException, Request
@@ -20,23 +20,16 @@
 from pathlib import Path
 from bs4 import BeautifulSoup
 from fastapi.middleware.cors import CORSMiddleware
 
+import db
+
 
 @contextlib.asynccontextmanager
 async def startup(_):
-    if not CACHE_DIR.exists():
-        CACHE_DIR.mkdir(parents=True)
-    with sqlite3.connect(CACHE_FILE) as conn:
-        conn.execute(
-            """
-            CREATE TABLE IF NOT EXISTS cache (
-                uuid TEXT PRIMARY KEY,
-                url TEXT NOT NULL,
-                ts INTEGER NOT NULL,
-                metadata TEXT NOT NULL
-            )
-            """
-        )
-    yield
+    with db.db:
+        logging.info("Creating tables")
+        db.db.create_tables([db.CachedURLs, db.CachedMedia])
+        db.db.commit()
+    yield
 
 
 logging.basicConfig(
@@ -73,6 +66,29 @@
 INVALID_TOKEN = JSONResponse(
     {"errcode": "M_INVALID_TOKEN", "error": "Invalid access token"}, 401
 )
+USER_AGENTS = {
+    "twitter": "TwitterBot/1.0",
+    "firefox": "Mozilla/5.0 (X11; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0",
+    "chrome": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
+              "Chrome/121.0.6167.160 Safari/537.36",
+    "google": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; "
+              "+http://www.google.com/bot.html) Chrome/121.0.6167.160 Safari/537.36",
+    "bing": "Mozilla/5.0 (compatible; Bingbot/2.0; +http://www.bing.com/bingbot.htm)",
+    "yahoo": "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
+    "duckduckgo": "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)",
+    "facebook": "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)",
+    "honest": "MatrixDropInURLPreviewBot/0.1 (+https://git.i-am.nexus/nex/drop-in-url-previews)"
+}
+os.environ.setdefault("PREVIEW_USER_AGENT", "google")
+if os.environ["PREVIEW_USER_AGENT"].lower() not in USER_AGENTS:
+    raise ValueError(
+        "Invalid user agent: %r\nMust be one of: %s" % (
+            os.environ["PREVIEW_USER_AGENT"],
+            ", ".join(USER_AGENTS.keys())
+        )
+    )
+USER_AGENT = USER_AGENTS[os.environ["PREVIEW_USER_AGENT"].lower()]
+logging.debug("Selecting user agent: %r", USER_AGENT)
 VALID_OG_TAGS = [
     "og:title",
     "og:type",
@@ -145,24 +161,14 @@ URL_OG_TAGS = [
     "og:audio:secure_url"
 ]
 TWITTER_MAPPING = {
-    "twitter:site": "og:site_name",
-    "twitter:creator": "og:site_name",
     "twitter:image": "og:image",
     "twitter:title": "og:title",
     "twitter:image:width": "og:image:width",
     "twitter:image:height": "og:image:height",
+    "twitter:image:alt": "og:image:alt",
+    "twitter:description": "og:description",
 }
-if Path.cwd() == Path("/app"):
-    logging.info("Look to be running in a docker container. Cache will be stored in /app/cache.")
-    CACHE_DIR = Path("/app/cache")
-else:
-    CACHE_DIR = Path(appdirs.user_cache_dir("matrix-url-preview"))
-CACHE_DIR.mkdir(parents=True, exist_ok=True)
-CACHE_FILE = CACHE_DIR / "db.sqlite3"
-CACHE_FILE.touch(exist_ok=True)
-logging.debug("Cache file: %r", CACHE_FILE)
-
 
 def upload_media(
     client: httpx.Client,
@@ -173,6 +179,19 @@ def upload_media(
     content_type: str
 ):
     file.seek(0)
+    # 1000 hurts me because 1024 feels correct, but `MB` does in fact stand for MegaByte, not MebiByte.
+    if len(file.getvalue()) > int(os.getenv("PREVIEW_MAX_MEDIA_MB", "50")) * 1000 * 1000:
+        logging.warning(
+            "Media too large: %.2f megabytes (max %sMB)",
+            len(file.getvalue()) / 1_000_000,
+            os.getenv("PREVIEW_MAX_MEDIA_MB", "50")
+        )
+        # Over the limit: bail out instead of uploading the file anyway.
+        return None
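+    # De-duplicate by checksum: identical bytes hash to the same MD5, so if
+    # this exact file has been uploaded before, reuse its mxc:// URL rather
+    # than re-uploading it to the media repo.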
+    md5 = hashlib.md5(file.getvalue()).hexdigest()
+    value = db.CachedMedia.get_or_none(md5=md5)
+    if value:
+        logging.info("found cached media for %r - %r", md5, value.mxc_url)
+        return value.mxc_url
+
     logging.info(
         "Creating media at %r called %r with the content type %r and %d bytes",
         domain,
@@ -197,6 +216,7 @@
         logging.info("Media uploaded successfully")
         mxc_url = response.json()["content_uri"]
         logging.debug("Media uploaded: %r", mxc_url)
+        # create() already persists the row; no extra save() call is needed.
+        db.CachedMedia.create(mxc_url=mxc_url, md5=md5)
         return mxc_url
     else:
         logging.warning("Failed to upload media: HTTP %s", response.status_code)
@@ -204,9 +224,33 @@
         return None
 
 
+def __preview_img(url: str, client: httpx.Client, access_token: str) -> dict:
+    bio = io.BytesIO()
+    # noinspection PyArgumentList
+    with client.stream("GET", url) as response:
+        for chunk in response.iter_bytes():
+            bio.write(chunk)
+    bio.seek(0)
+    mxc_url = upload_media(
+        client,
+        os.environ.get("PREVIEW_HOMESERVER", "https://matrix.org"),
+        access_token,
+        bio,
+        Path(httpx.URL(url).path).name,
+        response.headers.get("content-type", "image/jpeg")
+    )
+    if mxc_url:
+        return {
+            "og:image": mxc_url,
+            "matrix:image:size": len(bio.getvalue())
+        }
+    return {}
+
+
 @app.get("/preview_url")
 def preview_url(
     req: Request,
+    res: JSONResponse,
     url: Annotated[str, Query(..., description="URL to preview")],
     ts: int = Query(None, description="The preferred point in time to return a preview for."),
     access_token_qs: str | None = Query(None, alias="access_token", description="Access token to use for the request."),
@@ -225,40 +269,38 @@ def preview_url(
     else:
         return MISSING_TOKEN
 
-    with sqlite3.connect(CACHE_FILE) as conn:
-        cursor = conn.cursor()
-        cursor.execute(
-            "SELECT metadata,ts FROM cache WHERE url = ?",
-            (url,)
-        )
-        results = cursor.fetchall()
-        if results:
-            for result in results:
-                # find the one with the closest timestamp
-                metadata, _ts = result
-                created_at = datetime.datetime.fromtimestamp(_ts - 86400)
-                if ts is None or created_at <= datetime.datetime.fromtimestamp(ts):
-                    logging.debug("Optimal cache hit for %r", url)
-                    return json.loads(metadata)
-            else:
-                logging.debug("No optimal cache matches for url %r.", url)
-            # No close matches, get the latest one
-            metadata, _ts = results[-1]
-            created_at = datetime.datetime.fromtimestamp(_ts)
-            if (datetime.datetime.now() - created_at).days <= 7:
-                logging.debug("Stale cache hit for %r", url)
-                return json.loads(metadata)
-            else:
-                logging.debug("Stale cache miss for %r", url)
-        else:
-            logging.debug("Full cache miss for %r", url)
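+    # Cache policy: an entry created no more than a day after the requested
+    # `ts` (or any entry, when no `ts` is given) counts as "optimal"; failing
+    # that, the newest entry is served if it is under a week old ("stale").
+    # The X-Cache response header reports which path was taken.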
+    # Materialise the query as a list so the results[-1] lookup below works.
+    results = list(db.CachedURLs.select().where(db.CachedURLs.url == url))
+    if results:
+        for result in results:
+            # find the one with the closest timestamp
+            metadata = json.loads(result.metadata)
+            _ts = result.ts
+            created_at = datetime.datetime.fromtimestamp(_ts - 86400)
+            if ts is None or created_at <= datetime.datetime.fromtimestamp(ts):
+                logging.debug("Optimal cache hit for %r", url)
+                res.headers["X-Cache"] = "optimal"
+                return metadata
+        else:
+            logging.debug("No optimal cache matches for url %r.", url)
+        # No close matches, get the latest one
+        latest = results[-1]
+        metadata = json.loads(latest.metadata)
+        created_at = datetime.datetime.fromtimestamp(latest.ts)
+        if (datetime.datetime.now() - created_at).days <= 7:
+            logging.debug("Stale cache hit for %r", url)
+            res.headers["X-Cache"] = "stale"
+            return metadata
+        else:
+            logging.debug("Stale cache miss for %r", url)
+            res.headers["X-Cache"] = "stale-miss"
+    else:
+        logging.debug("Full cache miss for %r", url)
+        res.headers["X-Cache"] = "full-miss"
 
     domain = os.environ.get("PREVIEW_HOMESERVER", "https://" + req.url.hostname)
 
     with lock:
         with httpx.Client(
             headers={
-                # "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0"
-                "User-Agent": "TwitterBot/1.0"
+                "User-Agent": USER_AGENT
             },
             timeout=60,
             follow_redirects=False,
@@ -286,8 +328,21 @@
             logging.debug(f"Failed to fetch {url}", exc_info=True)
             raise HTTPException(502, f"Failed to fetch {url} - {e}")
 
-    if "text/html" not in response.headers.get("content-type", ""):
-        return {}
+    content_type = response.headers.get("content-type", "application/octet-stream")
+    if fnmatch.fnmatch(content_type, "image/*"):
+        result = __preview_img(url, client, access_token)
+        db.CachedURLs.create(
+            url=url,
+            ts=round(time.time()),
+            metadata=json.dumps(result)
+        )
+        res.headers["Cache-Control"] = "public, max-age=86400"
+        return result
+    if "text/html" not in content_type:
+        res.status_code = 204
+        res.media_type = "text/plain"
+        res.headers["Cache-Control"] = "no-store"
+        return None
 
     soup = BeautifulSoup(response.text, "html.parser")
     og_tags = {}
@@ -373,18 +428,13 @@
         value = og_tags.pop(key, None)
         og_tags["og:" + key] = value
 
-    with sqlite3.connect(CACHE_FILE) as conn:
-        conn.execute(
-            "INSERT INTO cache (uuid, url, ts, metadata) VALUES (?, ?, ?, ?)",
-            (str(uuid.uuid4()), url, round(time.time()), json.dumps(og_tags))
-        )
-    return JSONResponse(
-        og_tags,
-        200,
-        headers={
-            "Cache-Control": "public, max-age=86400"
-        }
-    )
+    db.CachedURLs.create(
+        url=url,
+        ts=round(time.time()),
+        metadata=json.dumps(og_tags)
+    )
+    res.headers["Cache-Control"] = "public, max-age=86400"
+    return og_tags
 
 
 if __name__ == "__main__":