* migrated to using PeeWee ORM for increased reliability
* now allow rendering images as actual previews (videos next)
* Improved documentation
This commit is contained in:
Nexus 2024-02-25 22:09:51 +00:00
parent ce13580bd4
commit be73449353
Signed by: nex
GPG key ID: 0FA334385D0B689F
6 changed files with 212 additions and 79 deletions

View file

@ -4,7 +4,8 @@ WORKDIR /app
COPY requirements.txt /tmp/requirements.txt COPY requirements.txt /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt RUN pip install -r /tmp/requirements.txt
RUN rm /tmp/requirements.txt
COPY server.py /app/server.py COPY ./src/ /app/
CMD ["python", "server.py"] CMD ["python", "server.py"]

View file

@ -1,7 +1,23 @@
# Drop In URL previews server # Drop In URL previews server
*aka DIP / Drop in previews*
A simple python server that handles /_matrix/media/*/preview_url requests, for servers like Dendrite. A simple python server that handles /_matrix/media/*/preview_url requests, for servers like Dendrite.
You may also want to replace your homeserver's URL preview generator with this one (in case this offers more features).
## Features
DIP is complete with the following features:
* **Full** [`OG/OpenGraph`](https://ogp.me/) tag support
* Partial [`Twitter`](https://developer.twitter.com/en/docs/twitter-for-websites/cards/overview/markup) card support
* Supports rendering previews for image files
* Proxying requests through an HTTP/HTTPS/SOCKS4/SOCKS5 proxy
* Custom user agent for requests
* Caching previews to prevent repeated lookups
* Built-in media duplication prevention
## Installation ## Installation
Just use docker. Just use docker.
@ -20,7 +36,7 @@ services:
- "PREVIEW_HOMESERVER=https://matrix.nexy7574.co.uk" - "PREVIEW_HOMESERVER=https://matrix.nexy7574.co.uk"
- "FORWARDED_ALLOW_IPS=*" - "FORWARDED_ALLOW_IPS=*"
ports: ports:
- "2226:2226" - "2226:2226/tcp"
restart: "unless-stopped" restart: "unless-stopped"
container_name: "dendrite-url-previews" container_name: "dendrite-url-previews"
volumes: volumes:
@ -33,10 +49,15 @@ volumes:
## Configuration ## Configuration
| Environment Variable | Description | Example | Default | | Environment Variable | Description | Example | Default |
|-----------------------|----------------------------------------------------------------------------------------------------|---------------------------------|-----------------------------------| |------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------|-----------------------------------|
| `PREVIEW_HOMESERVER` | The homeserver to use for the previews. | `https://matrix.nexy7574.co.uk` | The host name of the request URL. | | `PREVIEW_HOMESERVER` | The homeserver to use for the previews. | `https://matrix.nexy7574.co.uk` | The host name of the request URL. |
| `PREVIEW_HOST` | The host IP/Name to listen to. | `192.168.0.2` | `0.0.0.0` | | `PREVIEW_HOST` | The host IP/Name to listen to. | `192.168.0.2` | `0.0.0.0` |
| `PREVIEW_PORT` | The port to listen to. | `8080` | `2226` | | `PREVIEW_PORT` | The port to listen to. | `8080` | `2226` |
| `PREVIEW_PROXY` | A HTTP/HTTPS/SOCKS4/SOCKS5 proxy to use for all network requests. | `http://localhost:1080` | null |
| `FORWARDED_ALLOW_IPS` | The list of reverse proxy IPs to trust. See [Uvicorn docs](https://www.uvicorn.org/settings/#http) | * | `127.0.0.1` | | `PREVIEW_USER_AGENT` | The user agent to use for all network requests. Must be one of `google`, `bing`, `duckduckgo`, `firefox`, `chrome`, `twitter`, `facebook`, `honest` (uses a unique user agent) | `firefox` | `google` |
| `PREVIEW_MAX_MEDIA_MB` | The maximum size of media to proxy in megabytes. Media larger than this downloaded from sites will not be re-uploaded to the homeserver's media repo, and as such cannot be used in the preview response. | `10` | `50` |
| `PREVIEW_DATABASE_URL` | The sqlite://, postgres://, or mysql:// URL to use for the database. | `postgres://user:pass@localhost:5432/dip` | `sqlite:///app/data/db.sqlite3` |
| `FORWARDED_ALLOW_IPS` | The list of reverse proxy IPs to trust. See [Uvicorn docs](https://www.uvicorn.org/settings/#http) | * | `127.0.0.1` |
| `LOG_LEVEL` | The log level to use. One of `DEBUG`, `INFO`, `WARNING`, `ERROR`, `CRITICAL`. | `INFO` | `INFO` |
| `LOG_DEBUG_TIDY` | When `LOG_LEVEL` is `DEBUG`, silences some really noisy loggers (like HTTP request loggers) to help you debug this program, not a dependency. | `true` | `false` |

View file

@ -4,3 +4,4 @@ fastapi~=0.109
httpx[socks]~=0.26 httpx[socks]~=0.26
beautifulsoup4~=4.12 beautifulsoup4~=4.12
appdirs~=1.4 appdirs~=1.4
peewee~=3.17

0
src/__init__.py Normal file
View file

60
src/db.py Normal file
View file

@ -0,0 +1,60 @@
import uuid as _uuid
import os
import typing
from peewee import *
from pathlib import Path
from appdirs import user_cache_dir
def get_db():
    """Build and return the peewee database handle for the preview cache.

    The connection is chosen from the ``PREVIEW_DATABASE_URL`` environment
    variable (documented in the README). When it is unset, a local sqlite
    file is used: ``/data/db.sqlite3`` when running from the ``/app``
    docker workdir, otherwise a file under the per-user cache directory.

    Returns:
        A peewee ``Database`` instance (sqlite, postgres, or mysql).

    Raises:
        ValueError: if the URL scheme is not sqlite, postgres, or mysql.
    """
    # The README documents PREVIEW_DATABASE_URL; keep accepting the older
    # PREVIEW_DATABASE_URI spelling for backward compatibility.
    uri = os.getenv("PREVIEW_DATABASE_URL") or os.getenv("PREVIEW_DATABASE_URI")
    if uri:
        if uri.startswith("sqlite"):
            # Strip the 9-character "sqlite://" scheme prefix, leaving the
            # filesystem path (e.g. "sqlite:///app/x.db" -> "/app/x.db").
            return SqliteDatabase(uri[9:])
        elif uri.startswith("postgres"):
            # NOTE(review): PostgresqlDatabase/MySQLDatabase expect a database
            # *name* plus connection kwargs, not a full URL — consider
            # playhouse.db_url.connect(uri) instead; verify against peewee docs.
            return PostgresqlDatabase(uri)
        elif uri.startswith("mysql"):
            return MySQLDatabase(uri)
        raise ValueError("Unknown database URI. Must be sqlite, postgres or mysql.")
    # No URL configured: pick a local sqlite location.
    if Path.cwd() == Path("/app"):
        # Running inside the docker image (WORKDIR /app) — use the /data volume.
        _dir = Path("/data")
    else:
        _dir = Path(user_cache_dir("matrix-url-preview"))
    file = _dir / "db.sqlite3"
    file.parent.mkdir(parents=True, exist_ok=True)
    file.touch(exist_ok=True)
    return SqliteDatabase(file)


# Module-level handle shared by all models below.
db = get_db()
class CachedURLs(Model):
    """Cached URL-preview results: one row per generated preview.

    Rows are looked up by ``url`` on every /preview_url request so stale
    previews can be served without re-fetching the page.
    """

    # Random surrogate primary key.
    uuid = UUIDField(primary_key=True, default=_uuid.uuid4)
    # The original page URL the preview was generated for.
    url = TextField(null=False)
    # Unix timestamp (seconds) of when the preview was created —
    # presumably used for freshness checks; confirm against callers.
    ts = FloatField(null=False)
    # JSON-encoded OpenGraph tag dictionary returned to the client.
    metadata = TextField(null=False)

    class Meta:
        database = db

    if typing.TYPE_CHECKING:
        # Static-only annotations so type checkers see plain Python types
        # instead of peewee field descriptors.
        uuid: _uuid.UUID
        url: str
        ts: float
        metadata: str
class CachedMedia(Model):
    """Uploaded media cache, used to avoid re-uploading duplicate files.

    Rows are looked up by ``md5`` (indexed) before uploading media to the
    homeserver; on a hit the existing ``mxc_url`` is reused.
    """

    # Random surrogate primary key.
    uuid = UUIDField(primary_key=True, default=_uuid.uuid4)
    # The matrix mxc:// content URI returned by the media repo upload.
    mxc_url = TextField(null=False)
    # MD5 hex digest of the file contents; indexed for dedup lookups.
    md5 = TextField(null=False, index=True)

    class Meta:
        database = db

    if typing.TYPE_CHECKING:
        # Static-only annotations so type checkers see plain Python types
        # instead of peewee field descriptors.
        uuid: _uuid.UUID
        mxc_url: str
        md5: str

View file

@ -1,5 +1,6 @@
import contextlib import contextlib
import datetime import datetime
import fnmatch
import io import io
import json import json
import os import os
@ -10,8 +11,7 @@ import uuid
import fastapi import fastapi
import httpx import httpx
import logging import logging
import sqlite3 import hashlib
import appdirs
from threading import Lock from threading import Lock
from typing import Annotated from typing import Annotated
from fastapi import Query, Header, HTTPException, Request from fastapi import Query, Header, HTTPException, Request
@ -20,23 +20,16 @@ from pathlib import Path
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
import db
@contextlib.asynccontextmanager @contextlib.asynccontextmanager
async def startup(_): async def startup(_):
if not CACHE_DIR.exists(): with db.db:
CACHE_DIR.mkdir(parents=True) logging.info("Creating tables")
with sqlite3.connect(CACHE_FILE) as conn: db.db.create_tables([db.CachedURLs, db.CachedMedia])
conn.execute( db.db.commit()
""" yield
CREATE TABLE IF NOT EXISTS cache (
uuid TEXT PRIMARY KEY,
url TEXT NOT NULL,
ts INTEGER NOT NULL,
metadata TEXT NOT NULL
)
"""
)
yield
logging.basicConfig( logging.basicConfig(
@ -73,6 +66,29 @@ INVALID_TOKEN = JSONResponse(
{"errcode": "M_INVALID_TOKEN", "error": "Invalid access token"}, {"errcode": "M_INVALID_TOKEN", "error": "Invalid access token"},
401 401
) )
USER_AGENTS = {
"twitter": "TwitterBot/1.0",
"firefox": "Mozilla/5.0 (X11; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0",
"chrome": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, Like Gecko) "
"Chrome/121.9.6167.160 Safari/537.36",
"google": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; "
"+http://www.google.com/bot.html) Chrome/121.9.6167.160 Safari/537.36",
"bing": "Mozilla/5.0 (compatible; Bingbot/2.0; +http://www.bing.com/bingbot.htm)",
"yahoo": "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
"duckduckgo": "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)",
"facebook": "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)",
"honest": "MatrixDropInURLPreviewBot/0.1 (+https://git.i-am.nexus/nex/drop-in-url-previews)"
}
os.environ.setdefault("PREVIEW_USER_AGENT", "google")
if os.environ["PREVIEW_USER_AGENT"].lower() not in USER_AGENTS:
raise ValueError(
"Invalid user agent: %r\nMust be one of: %s" % (
os.environ["PREVIEW_USER_AGENT"],
", ".join(USER_AGENTS.keys())
)
)
USER_AGENT = USER_AGENTS[os.environ["PREVIEW_USER_AGENT"].lower()]
logging.debug("Selecting user agent: %r", USER_AGENT)
VALID_OG_TAGS = [ VALID_OG_TAGS = [
"og:title", "og:title",
"og:type", "og:type",
@ -145,24 +161,14 @@ URL_OG_TAGS = [
"og:audio:secure_url" "og:audio:secure_url"
] ]
TWITTER_MAPPING = { TWITTER_MAPPING = {
"twitter:site": "og:site_name",
"twitter:creator": "og:site_name",
"twitter:image": "og:image", "twitter:image": "og:image",
"twitter:title": "og:title", "twitter:title": "og:title",
"twitter:image:width": "og:image:width", "twitter:image:width": "og:image:width",
"twitter:image:height": "og:image:height", "twitter:image:height": "og:image:height",
"twitter:image:alt": "og:image:alt",
"twitter:description": "og:description",
} }
if Path.cwd() == Path("/app"):
logging.info("Look to be running in a docker container. Cache will be stored in /app/cache.")
CACHE_DIR = Path("/app/cache")
else:
CACHE_DIR = Path(appdirs.user_cache_dir("matrix-url-preview"))
CACHE_DIR.mkdir(parents=True, exist_ok=True)
CACHE_FILE = CACHE_DIR / "db.sqlite3"
CACHE_FILE.touch(exist_ok=True)
logging.debug("Cache file: %r", CACHE_FILE)
def upload_media( def upload_media(
client: httpx.Client, client: httpx.Client,
@ -173,6 +179,19 @@ def upload_media(
content_type: str content_type: str
): ):
file.seek(0) file.seek(0)
# 1000 hurts me because 1024 feels correct, but `MB` does in fact stand for MegaByte, not MebiByte.
if len(file.getvalue()) > int(os.getenv("PREVIEW_MAX_MEDIA_MB", "50")) * 1000 * 1000:
logging.warning(
"Media too large: %.2f Megabytes (max %.2fMB)",
len(file.getvalue()),
int(os.getenv("PREVIEW_MAX_MEDIA_MB", "50"))
)
md5 = hashlib.md5(file.getvalue()).hexdigest()
value = db.CachedMedia.get_or_none(md5=md5)
if value:
logging.info("found cached media for %r - %r", md5, value.mxc_url)
return value.mxc_url
logging.info( logging.info(
"Creating media at %r called %r with the content type %r and %d bytes", "Creating media at %r called %r with the content type %r and %d bytes",
domain, domain,
@ -197,6 +216,7 @@ def upload_media(
logging.info("Media uploaded successfully") logging.info("Media uploaded successfully")
mxc_url = response.json()["content_uri"] mxc_url = response.json()["content_uri"]
logging.debug("Media uploaded: %r", mxc_url) logging.debug("Media uploaded: %r", mxc_url)
db.CachedMedia.create(mxc_url=mxc_url, md5=md5).save()
return mxc_url return mxc_url
else: else:
logging.warning("Failed to upload media: HTTP %s", response.status_code) logging.warning("Failed to upload media: HTTP %s", response.status_code)
@ -204,9 +224,33 @@ def upload_media(
return None return None
def __preview_img(url: str, client: httpx.Client, access_token: str) -> dict:
bio = io.BytesIO()
# noinspection PyArgumentList
with client.stream("GET", url) as response:
for chunk in response.iter_bytes():
bio.write(chunk)
bio.seek(0)
mxc_url = upload_media(
client,
os.environ.get("PREVIEW_HOMESERVER", "https://matrix.org"),
access_token,
bio,
Path(httpx.URL(url).path).name,
response.headers.get("content-type", "image/jpeg")
)
if mxc_url:
return {
"og:image": mxc_url,
"matrix:image:size": len(bio.getvalue())
}
return {}
@app.get("/preview_url") @app.get("/preview_url")
def preview_url( def preview_url(
req: Request, req: Request,
res: JSONResponse,
url: Annotated[str, Query(..., description="URL to preview")], url: Annotated[str, Query(..., description="URL to preview")],
ts: int = Query(None, description="The preferred point in time to return a preview for."), ts: int = Query(None, description="The preferred point in time to return a preview for."),
access_token_qs: str | None = Query(None, alias="access_token", description="Access token to use for the request."), access_token_qs: str | None = Query(None, alias="access_token", description="Access token to use for the request."),
@ -225,40 +269,38 @@ def preview_url(
else: else:
return MISSING_TOKEN return MISSING_TOKEN
with sqlite3.connect(CACHE_FILE) as conn: results = db.CachedURLs.select().where(db.CachedURLs.url == url)
cursor = conn.cursor() if results:
cursor.execute( for result in results:
"SELECT metadata,ts FROM cache WHERE url = ?", # find the one with the closest timestamp
(url,) metadata = json.loads(result.metadata)
) _ts = result.ts
results = cursor.fetchall() created_at = datetime.datetime.fromtimestamp(_ts - 86400)
if results: if ts is None or created_at <= datetime.datetime.fromtimestamp(ts):
for result in results: logging.debug("Optimal cache hit for %r", url)
# find the one with the closest timestamp res.headers["X-Cache"] = "optimal"
metadata, _ts = result return metadata
created_at = datetime.datetime.fromtimestamp(_ts - 86400)
if ts is None or created_at <= datetime.datetime.fromtimestamp(ts):
logging.debug("Optimal cache hit for %r", url)
return json.loads(metadata)
else:
logging.debug("No optimal cache matches for url %r.", url)
# No close matches, get the latest one
metadata, _ts = results[-1]
created_at = datetime.datetime.fromtimestamp(_ts)
if (datetime.datetime.now() - created_at).days <= 7:
logging.debug("Stale cache hit for %r", url)
return json.loads(metadata)
else:
logging.debug("Stale cache miss for %r", url)
else: else:
logging.debug("Full cache miss for %r", url) logging.debug("No optimal cache matches for url %r.", url)
# No close matches, get the latest one
metadata, _ts = results[-1]
created_at = datetime.datetime.fromtimestamp(_ts)
if (datetime.datetime.now() - created_at).days <= 7:
logging.debug("Stale cache hit for %r", url)
res.headers["X-Cache"] = "stale"
return metadata
else:
logging.debug("Stale cache miss for %r", url)
res.headers["X-Cache"] = "stale-miss"
else:
logging.debug("Full cache miss for %r", url)
res.headers["X-Cache"] = "full-miss"
domain = os.environ.get("PREVIEW_HOMESERVER", "https://" + req.url.hostname) domain = os.environ.get("PREVIEW_HOMESERVER", "https://" + req.url.hostname)
with lock: with lock:
with httpx.Client( with httpx.Client(
headers={ headers={
# "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0" "User-Agent": USER_AGENT
"User-Agent": "TwitterBot/1.0"
}, },
timeout=60, timeout=60,
follow_redirects=False, follow_redirects=False,
@ -286,8 +328,21 @@ def preview_url(
logging.debug(f"Failed to fetch {url}", exc_info=True) logging.debug(f"Failed to fetch {url}", exc_info=True)
raise HTTPException(502, f"Failed to fetch {url} - {e}") raise HTTPException(502, f"Failed to fetch {url} - {e}")
if "text/html" not in response.headers.get("content-type", ""): content_type = response.headers.get("content-type", "application/octet-stream")
return {} if fnmatch.fnmatch(content_type, "image/*"):
result = __preview_img(url, client, access_token)
db.CachedURLs.create(
url=url,
ts=round(time.time()),
metadata=json.dumps(result)
).save()
res.headers["Cache-Control"] = "public, max-age=86400"
return result
if "text/html" not in content_type:
res.status_code = 204
res.media_type = "text/plain"
res.headers["Cache-Control"] = "no-store"
return None
soup = BeautifulSoup(response.text, "html.parser") soup = BeautifulSoup(response.text, "html.parser")
og_tags = {} og_tags = {}
@ -373,18 +428,13 @@ def preview_url(
value = og_tags.pop(key, None) value = og_tags.pop(key, None)
og_tags["og:" + key] = value og_tags["og:" + key] = value
with sqlite3.connect(CACHE_FILE) as conn: db.CachedURLs.create(
conn.execute( url=url,
"INSERT INTO cache (uuid, url, ts, metadata) VALUES (?, ?, ?, ?)", ts=round(time.time()),
(str(uuid.uuid4()), url, round(time.time()), json.dumps(og_tags)) metadata=json.dumps(og_tags)
) ).save()
return JSONResponse( res.headers["Cache-Control"] = "public, max-age=86400"
og_tags, return og_tags
200,
headers={
"Cache-Control": "public, max-age=86400"
}
)
if __name__ == "__main__": if __name__ == "__main__":