* migrated to using PeeWee ORM for increased reliability
* now allow rendering images as actual previews (videos next)
* Improved documentation
This commit is contained in:
Nexus 2024-02-25 22:09:51 +00:00
parent ce13580bd4
commit be73449353
Signed by: nex
GPG key ID: 0FA334385D0B689F
6 changed files with 212 additions and 79 deletions

View file

@ -4,7 +4,8 @@ WORKDIR /app
COPY requirements.txt /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt
RUN rm /tmp/requirements.txt
COPY server.py /app/server.py
COPY ./src/ /app/
CMD ["python", "server.py"]

View file

@ -1,7 +1,23 @@
# Drop In URL previews server
*aka DIP / Drop in previews*
A simple python server that handles /_matrix/media/*/preview_url requests, for servers like Dendrite.
You may also want to replace your homeserver's URL preview generator with this one (in case this offers more features).
## Features
DIP is complete with the following features:
* **Full** [`OG/OpenGraph`](https://ogp.me/) tag support
* Partial [`Twitter`](https://developer.twitter.com/en/docs/twitter-for-websites/cards/overview/markup) card support
* Supports rendering previews for image files
* Proxying requests through an HTTP/HTTPS/SOCKS4/SOCKS5 proxy
* Custom user agent for requests
* Caching previews to prevent repeated lookups
* Built-in media duplication prevention
## Installation
Just use docker.
@ -20,7 +36,7 @@ services:
- "PREVIEW_HOMESERVER=https://matrix.nexy7574.co.uk"
- "FORWARDED_ALLOW_IPS=*"
ports:
- "2226:2226"
- "2226:2226/tcp"
restart: "unless-stopped"
container_name: "dendrite-url-previews"
volumes:
@ -33,10 +49,15 @@ volumes:
## Configuration
| Environment Variable | Description | Example | Default |
|-----------------------|----------------------------------------------------------------------------------------------------|---------------------------------|-----------------------------------|
| `PREVIEW_HOMESERVER` | The homeserver to use for the previews. | `https://matrix.nexy7574.co.uk` | The host name of the request URL. |
| `PREVIEW_HOST` | The host IP/Name to listen to. | `192.168.0.2` | `0.0.0.0` |
| `PREVIEW_PORT` | The port to listen to. | `8080` | `2226` |
| `PREVIEW_PROXY` | A HTTP/HTTPS/SOCKS4/SOCKS5 proxy to use for all network requests.
| `FORWARDED_ALLOW_IPS` | The list of reverse proxy IPs to trust. See [Uvicorn docs](https://www.uvicorn.org/settings/#http) | * | `127.0.0.1` |
| Environment Variable | Description | Example | Default |
|------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------|-----------------------------------|
| `PREVIEW_HOMESERVER` | The homeserver to use for the previews. | `https://matrix.nexy7574.co.uk` | The host name of the request URL. |
| `PREVIEW_HOST` | The host IP/Name to listen to. | `192.168.0.2` | `0.0.0.0` |
| `PREVIEW_PORT` | The port to listen to. | `8080` | `2226` |
| `PREVIEW_PROXY` | A HTTP/HTTPS/SOCKS4/SOCKS5 proxy to use for all network requests. | `http://localhost:1080` | null |
| `PREVIEW_USER_AGENT` | The user agent to use for all network requests. Must be one of `google`, `bing`, `duckduckgo`, `firefox`, `chrome`, `twitter`, `facebook`, `honest` (uses a unique user agent) | `firefox` | `google` |
| `PREVIEW_MAX_MEDIA_MB` | The maximum size of media to proxy in megabytes. Media larger than this downloaded from sites will not be re-uploaded to the homeserver's media repo, and as such cannot be used in the preview response. | `10` | `50` |
| `PREVIEW_DATABASE_URL` | The sqlite://, postgres://, or mysql:// URL to use for the database. | `postgres://user:pass@localhost:5432/dip` | `sqlite:///app/data/db.sqlite3` |
| `FORWARDED_ALLOW_IPS` | The list of reverse proxy IPs to trust. See [Uvicorn docs](https://www.uvicorn.org/settings/#http) | * | `127.0.0.1` |
| `LOG_LEVEL` | The log level to use. One of `DEBUG`, `INFO`, `WARNING`, `ERROR`, `CRITICAL`. | `INFO` | `INFO` |
| `LOG_DEBUG_TIDY` | When `LOG_LEVEL` is `DEBUG`, silences some really noisy loggers (like HTTP request loggers) to help you debug this program (not a dependency). | `true` | `false` |

View file

@ -4,3 +4,4 @@ fastapi~=0.109
httpx[socks]~=0.26
beautifulsoup4~=4.12
appdirs~=1.4
peewee~=3.17

0
src/__init__.py Normal file
View file

60
src/db.py Normal file
View file

@ -0,0 +1,60 @@
import uuid as _uuid
import os
import typing
from peewee import *
from pathlib import Path
from appdirs import user_cache_dir
def get_db():
    """Construct and return the peewee database backend.

    Honours ``PREVIEW_DATABASE_URI`` when set (and ``PREVIEW_DATABASE_URL``,
    the spelling the README documents). Without either, falls back to an
    on-disk SQLite file: ``/data/db.sqlite3`` when running from the ``/app``
    docker workdir, otherwise the per-user cache directory.

    Raises:
        ValueError: if the URI scheme is not sqlite, postgres, or mysql.
    """
    # The README documents PREVIEW_DATABASE_URL; accept both spellings so
    # documented configuration actually takes effect.
    uri = os.getenv("PREVIEW_DATABASE_URI") or os.getenv("PREVIEW_DATABASE_URL")
    if uri:
        if uri.startswith("sqlite"):
            # Strip the 9-character "sqlite://" scheme, leaving the file path.
            return SqliteDatabase(uri[9:])
        elif uri.startswith("postgres") or uri.startswith("mysql"):
            # PostgresqlDatabase/MySQLDatabase expect a database *name*, not a
            # full URL; playhouse.db_url.connect (bundled with peewee) parses
            # the scheme, credentials, host, port and database correctly.
            from playhouse.db_url import connect
            return connect(uri)
        else:
            raise ValueError("Unknown database URI. Must be sqlite, postgres or mysql.")
    else:
        if Path.cwd() == Path("/app"):
            # Running inside the docker container (Dockerfile sets WORKDIR /app).
            _dir = Path("/data")
        else:
            _dir = Path(user_cache_dir("matrix-url-preview"))
        file = _dir / "db.sqlite3"
        file.parent.mkdir(parents=True, exist_ok=True)
        file.touch(exist_ok=True)
        return SqliteDatabase(file)


db = get_db()
class CachedURLs(Model):
    """Cache row holding previously-generated preview metadata for a URL."""
    uuid = UUIDField(primary_key=True, default=_uuid.uuid4)  # surrogate primary key
    url = TextField(null=False)  # the URL that was previewed (lookup key; not unique — multiple snapshots may exist)
    ts = FloatField(null=False)  # unix timestamp at which the preview was generated
    metadata = TextField(null=False)  # JSON-encoded dict of OpenGraph-style tags (json.dumps on write, json.loads on read)

    class Meta:
        database = db  # module-level database selected by get_db()

    # Static-typing hints only; at runtime the peewee field descriptors above apply.
    if typing.TYPE_CHECKING:
        uuid: _uuid.UUID
        url: str
        ts: float
        metadata: str
class CachedMedia(Model):
    """Cache row mapping a file's MD5 digest to an already-uploaded mxc:// URI.

    Used to avoid re-uploading identical media to the homeserver's media repo.
    """
    uuid = UUIDField(primary_key=True, default=_uuid.uuid4)  # surrogate primary key
    mxc_url = TextField(null=False)  # mxc:// content URI returned by the homeserver on upload
    md5 = TextField(null=False, index=True)  # hex MD5 of the raw file bytes; indexed for dedup lookups

    class Meta:
        database = db  # module-level database selected by get_db()

    # Static-typing hints only; at runtime the peewee field descriptors above apply.
    if typing.TYPE_CHECKING:
        uuid: _uuid.UUID
        mxc_url: str
        md5: str

View file

@ -1,5 +1,6 @@
import contextlib
import datetime
import fnmatch
import io
import json
import os
@ -10,8 +11,7 @@ import uuid
import fastapi
import httpx
import logging
import sqlite3
import appdirs
import hashlib
from threading import Lock
from typing import Annotated
from fastapi import Query, Header, HTTPException, Request
@ -20,23 +20,16 @@ from pathlib import Path
from bs4 import BeautifulSoup
from fastapi.middleware.cors import CORSMiddleware
import db
@contextlib.asynccontextmanager
async def startup(_):
if not CACHE_DIR.exists():
CACHE_DIR.mkdir(parents=True)
with sqlite3.connect(CACHE_FILE) as conn:
conn.execute(
"""
CREATE TABLE IF NOT EXISTS cache (
uuid TEXT PRIMARY KEY,
url TEXT NOT NULL,
ts INTEGER NOT NULL,
metadata TEXT NOT NULL
)
"""
)
yield
with db.db:
logging.info("Creating tables")
db.db.create_tables([db.CachedURLs, db.CachedMedia])
db.db.commit()
yield
logging.basicConfig(
@ -73,6 +66,29 @@ INVALID_TOKEN = JSONResponse(
{"errcode": "M_INVALID_TOKEN", "error": "Invalid access token"},
401
)
USER_AGENTS = {
"twitter": "TwitterBot/1.0",
"firefox": "Mozilla/5.0 (X11; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0",
"chrome": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, Like Gecko) "
"Chrome/121.9.6167.160 Safari/537.36",
"google": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; "
"+http://www.google.com/bot.html) Chrome/121.9.6167.160 Safari/537.36",
"bing": "Mozilla/5.0 (compatible; Bingbot/2.0; +http://www.bing.com/bingbot.htm)",
"yahoo": "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
"duckduckgo": "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)",
"facebook": "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)",
"honest": "MatrixDropInURLPreviewBot/0.1 (+https://git.i-am.nexus/nex/drop-in-url-previews)"
}
os.environ.setdefault("PREVIEW_USER_AGENT", "google")
if os.environ["PREVIEW_USER_AGENT"].lower() not in USER_AGENTS:
raise ValueError(
"Invalid user agent: %r\nMust be one of: %s" % (
os.environ["PREVIEW_USER_AGENT"],
", ".join(USER_AGENTS.keys())
)
)
USER_AGENT = USER_AGENTS[os.environ["PREVIEW_USER_AGENT"].lower()]
logging.debug("Selecting user agent: %r", USER_AGENT)
VALID_OG_TAGS = [
"og:title",
"og:type",
@ -145,24 +161,14 @@ URL_OG_TAGS = [
"og:audio:secure_url"
]
TWITTER_MAPPING = {
"twitter:site": "og:site_name",
"twitter:creator": "og:site_name",
"twitter:image": "og:image",
"twitter:title": "og:title",
"twitter:image:width": "og:image:width",
"twitter:image:height": "og:image:height",
"twitter:image:alt": "og:image:alt",
"twitter:description": "og:description",
}
if Path.cwd() == Path("/app"):
logging.info("Look to be running in a docker container. Cache will be stored in /app/cache.")
CACHE_DIR = Path("/app/cache")
else:
CACHE_DIR = Path(appdirs.user_cache_dir("matrix-url-preview"))
CACHE_DIR.mkdir(parents=True, exist_ok=True)
CACHE_FILE = CACHE_DIR / "db.sqlite3"
CACHE_FILE.touch(exist_ok=True)
logging.debug("Cache file: %r", CACHE_FILE)
def upload_media(
client: httpx.Client,
@ -173,6 +179,19 @@ def upload_media(
content_type: str
):
file.seek(0)
# 1000 hurts me because 1024 feels correct, but `MB` does in fact stand for MegaByte, not MebiByte.
if len(file.getvalue()) > int(os.getenv("PREVIEW_MAX_MEDIA_MB", "50")) * 1000 * 1000:
logging.warning(
"Media too large: %.2f Megabytes (max %.2fMB)",
len(file.getvalue()),
int(os.getenv("PREVIEW_MAX_MEDIA_MB", "50"))
)
md5 = hashlib.md5(file.getvalue()).hexdigest()
value = db.CachedMedia.get_or_none(md5=md5)
if value:
logging.info("found cached media for %r - %r", md5, value.mxc_url)
return value.mxc_url
logging.info(
"Creating media at %r called %r with the content type %r and %d bytes",
domain,
@ -197,6 +216,7 @@ def upload_media(
logging.info("Media uploaded successfully")
mxc_url = response.json()["content_uri"]
logging.debug("Media uploaded: %r", mxc_url)
db.CachedMedia.create(mxc_url=mxc_url, md5=md5).save()
return mxc_url
else:
logging.warning("Failed to upload media: HTTP %s", response.status_code)
@ -204,9 +224,33 @@ def upload_media(
return None
def __preview_img(url: str, client: httpx.Client, access_token: str) -> dict:
    """Download an image URL, re-upload it to the homeserver media repo, and
    return OpenGraph-style tags pointing at the resulting mxc:// URI.

    Returns an empty dict when the upload did not yield an mxc URL.
    """
    buffer = io.BytesIO()
    # Stream the body chunk-by-chunk into an in-memory buffer.
    # noinspection PyArgumentList
    with client.stream("GET", url) as response:
        for chunk in response.iter_bytes():
            buffer.write(chunk)
    buffer.seek(0)
    mxc_url = upload_media(
        client,
        os.environ.get("PREVIEW_HOMESERVER", "https://matrix.org"),
        access_token,
        buffer,
        Path(httpx.URL(url).path).name,  # file name taken from the URL path
        response.headers.get("content-type", "image/jpeg"),
    )
    if not mxc_url:
        return {}
    return {
        "og:image": mxc_url,
        "matrix:image:size": len(buffer.getvalue()),
    }
@app.get("/preview_url")
def preview_url(
req: Request,
res: JSONResponse,
url: Annotated[str, Query(..., description="URL to preview")],
ts: int = Query(None, description="The preferred point in time to return a preview for."),
access_token_qs: str | None = Query(None, alias="access_token", description="Access token to use for the request."),
@ -225,40 +269,38 @@ def preview_url(
else:
return MISSING_TOKEN
with sqlite3.connect(CACHE_FILE) as conn:
cursor = conn.cursor()
cursor.execute(
"SELECT metadata,ts FROM cache WHERE url = ?",
(url,)
)
results = cursor.fetchall()
if results:
for result in results:
# find the one with the closest timestamp
metadata, _ts = result
created_at = datetime.datetime.fromtimestamp(_ts - 86400)
if ts is None or created_at <= datetime.datetime.fromtimestamp(ts):
logging.debug("Optimal cache hit for %r", url)
return json.loads(metadata)
else:
logging.debug("No optimal cache matches for url %r.", url)
# No close matches, get the latest one
metadata, _ts = results[-1]
created_at = datetime.datetime.fromtimestamp(_ts)
if (datetime.datetime.now() - created_at).days <= 7:
logging.debug("Stale cache hit for %r", url)
return json.loads(metadata)
else:
logging.debug("Stale cache miss for %r", url)
results = db.CachedURLs.select().where(db.CachedURLs.url == url)
if results:
for result in results:
# find the one with the closest timestamp
metadata = json.loads(result.metadata)
_ts = result.ts
created_at = datetime.datetime.fromtimestamp(_ts - 86400)
if ts is None or created_at <= datetime.datetime.fromtimestamp(ts):
logging.debug("Optimal cache hit for %r", url)
res.headers["X-Cache"] = "optimal"
return metadata
else:
logging.debug("Full cache miss for %r", url)
logging.debug("No optimal cache matches for url %r.", url)
# No close matches, get the latest one
metadata, _ts = results[-1]
created_at = datetime.datetime.fromtimestamp(_ts)
if (datetime.datetime.now() - created_at).days <= 7:
logging.debug("Stale cache hit for %r", url)
res.headers["X-Cache"] = "stale"
return metadata
else:
logging.debug("Stale cache miss for %r", url)
res.headers["X-Cache"] = "stale-miss"
else:
logging.debug("Full cache miss for %r", url)
res.headers["X-Cache"] = "full-miss"
domain = os.environ.get("PREVIEW_HOMESERVER", "https://" + req.url.hostname)
with lock:
with httpx.Client(
headers={
# "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0"
"User-Agent": "TwitterBot/1.0"
"User-Agent": USER_AGENT
},
timeout=60,
follow_redirects=False,
@ -286,8 +328,21 @@ def preview_url(
logging.debug(f"Failed to fetch {url}", exc_info=True)
raise HTTPException(502, f"Failed to fetch {url} - {e}")
if "text/html" not in response.headers.get("content-type", ""):
return {}
content_type = response.headers.get("content-type", "application/octet-stream")
if fnmatch.fnmatch(content_type, "image/*"):
result = __preview_img(url, client, access_token)
db.CachedURLs.create(
url=url,
ts=round(time.time()),
metadata=json.dumps(result)
).save()
res.headers["Cache-Control"] = "public, max-age=86400"
return result
if "text/html" not in content_type:
res.status_code = 204
res.media_type = "text/plain"
res.headers["Cache-Control"] = "no-store"
return None
soup = BeautifulSoup(response.text, "html.parser")
og_tags = {}
@ -373,18 +428,13 @@ def preview_url(
value = og_tags.pop(key, None)
og_tags["og:" + key] = value
with sqlite3.connect(CACHE_FILE) as conn:
conn.execute(
"INSERT INTO cache (uuid, url, ts, metadata) VALUES (?, ?, ?, ?)",
(str(uuid.uuid4()), url, round(time.time()), json.dumps(og_tags))
)
return JSONResponse(
og_tags,
200,
headers={
"Cache-Control": "public, max-age=86400"
}
)
db.CachedURLs.create(
url=url,
ts=round(time.time()),
metadata=json.dumps(og_tags)
).save()
res.headers["Cache-Control"] = "public, max-age=86400"
return og_tags
if __name__ == "__main__":