0.2.0a1
* migrated to using PeeWee ORM for increased reliability * now allow rendering images as actual previews (videos next) * Improved documentation
This commit is contained in:
parent
ce13580bd4
commit
be73449353
6 changed files with 212 additions and 79 deletions
|
@ -4,7 +4,8 @@ WORKDIR /app
|
|||
|
||||
COPY requirements.txt /tmp/requirements.txt
|
||||
RUN pip install -r /tmp/requirements.txt
|
||||
RUN rm /tmp/requirements.txt
|
||||
|
||||
COPY server.py /app/server.py
|
||||
COPY ./src/ /app/
|
||||
|
||||
CMD ["python", "server.py"]
|
||||
|
|
37
README.md
37
README.md
|
@ -1,7 +1,23 @@
|
|||
# Drop In URL previews server
|
||||
|
||||
*aka DIP / Drop in previews*
|
||||
|
||||
A simple python server that handles /_matrix/media/*/preview_url requests, for servers like Dendrite.
|
||||
|
||||
You may also want to replace your homeserver's URL preview generator with this one (in case this offers more features).
|
||||
|
||||
## Features
|
||||
|
||||
DIP is complete with the following features:
|
||||
|
||||
* **Full** [`OG/OpenGraph`](https://ogp.me/) tag support
|
||||
* Partial [`Twitter`](https://developer.twitter.com/en/docs/twitter-for-websites/cards/overview/markup) card support
|
||||
* Supports rendering previews for image files
|
||||
* Proxying requests through an HTTP/HTTPS/SOCKS4/SOCKS5 proxy
|
||||
* Custom user agent for requests
|
||||
* Caching previews to prevent repeated lookups
|
||||
* Built-in media duplication prevention
|
||||
|
||||
## Installation
|
||||
Just use docker.
|
||||
|
||||
|
@ -20,7 +36,7 @@ services:
|
|||
- "PREVIEW_HOMESERVER=https://matrix.nexy7574.co.uk"
|
||||
- "FORWARDED_ALLOW_IPS=*"
|
||||
ports:
|
||||
- "2226:2226"
|
||||
- "2226:2226/tcp"
|
||||
restart: "unless-stopped"
|
||||
container_name: "dendrite-url-previews"
|
||||
volumes:
|
||||
|
@ -33,10 +49,15 @@ volumes:
|
|||
|
||||
## Configuration
|
||||
|
||||
| Environment Variable | Description | Example | Default |
|
||||
|-----------------------|----------------------------------------------------------------------------------------------------|---------------------------------|-----------------------------------|
|
||||
| `PREVIEW_HOMESERVER` | The homeserver to use for the previews. | `https://matrix.nexy7574.co.uk` | The host name of the request URL. |
|
||||
| `PREVIEW_HOST` | The host IP/Name to listen to. | `192.168.0.2` | `0.0.0.0` |
|
||||
| `PREVIEW_PORT` | The port to listen to. | `8080` | `2226` |
|
||||
| `PREVIEW_PROXY` | A HTTP/HTTPS/SOCKS4/SOCKS5 proxy to use for all network requests.
|
||||
| `FORWARDED_ALLOW_IPS` | The list of reverse proxy IPs to trust. See [Uvicorn docs](https://www.uvicorn.org/settings/#http) | * | `127.0.0.1` |
|
||||
| Environment Variable | Description | Example | Default |
|
||||
|------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------|-----------------------------------|
|
||||
| `PREVIEW_HOMESERVER` | The homeserver to use for the previews. | `https://matrix.nexy7574.co.uk` | The host name of the request URL. |
|
||||
| `PREVIEW_HOST` | The host IP/Name to listen to. | `192.168.0.2` | `0.0.0.0` |
|
||||
| `PREVIEW_PORT` | The port to listen to. | `8080` | `2226` |
|
||||
| `PREVIEW_PROXY` | A HTTP/HTTPS/SOCKS4/SOCKS5 proxy to use for all network requests. | `http://localhost:1080` | null |
|
||||
| `PREVIEW_USER_AGENT` | The user agent to use for all network requests. Must be one of `google`, `bing`, `duckduckgo`, `firefox`, `chrome`, `twitter`, `facebook`, `honest` (uses a unique user agent) | `firefox` | `google` |
|
||||
| `PREVIEW_MAX_MEDIA_MB` | The maximum size of media to proxy in megabytes. Media larger than this downloaded from sites will not be re-uploaded to the homeserver's media repo, and as such cannot be used in the preview response. | `10` | `50` |
|
||||
| `PREVIEW_DATABASE_URL` | The sqlite://, postgres://, or mysql:// URL to use for the database. | `postgres://user:pass@localhost:5432/dip` | `sqlite:///app/data/db.sqlite3` |
|
||||
| `FORWARDED_ALLOW_IPS` | The list of reverse proxy IPs to trust. See [Uvicorn docs](https://www.uvicorn.org/settings/#http) | * | `127.0.0.1` |
|
||||
| `LOG_LEVEL` | The log level to use. One of `DEBUG`, `INFO`, `WARNING`, `ERROR`, `CRITICAL`. | `INFO` | `INFO` |
|
||||
| `LOG_DEBUG_TIDY` | When `LOG_LEVEL` is `DEBUG`, silences some really noisy loggers (like HTTP request loggers) to help you debug this program (not a dependency). | `true` | `false` |
|
||||
|
|
|
@ -4,3 +4,4 @@ fastapi~=0.109
|
|||
httpx[socks]~=0.26
|
||||
beautifulsoup4~=4.12
|
||||
appdirs~=1.4
|
||||
peewee~=3.17
|
||||
|
|
0
src/__init__.py
Normal file
0
src/__init__.py
Normal file
60
src/db.py
Normal file
60
src/db.py
Normal file
|
@ -0,0 +1,60 @@
|
|||
import uuid as _uuid
|
||||
import os
|
||||
import typing
|
||||
from peewee import *
|
||||
from pathlib import Path
|
||||
from appdirs import user_cache_dir
|
||||
|
||||
|
||||
def get_db():
|
||||
if uri := os.getenv("PREVIEW_DATABASE_URI"):
|
||||
if uri.startswith("sqlite"):
|
||||
return SqliteDatabase(uri[9:])
|
||||
elif uri.startswith("postgres"):
|
||||
return PostgresqlDatabase(uri)
|
||||
elif uri.startswith("mysql"):
|
||||
return MySQLDatabase(uri)
|
||||
else:
|
||||
raise ValueError("Unknown database URI. Must be sqlite, postgres or mysql.")
|
||||
else:
|
||||
if Path.cwd() == Path("/app"):
|
||||
_dir = Path("/data")
|
||||
else:
|
||||
_dir = Path(user_cache_dir("matrix-url-preview"))
|
||||
file = _dir / "db.sqlite3"
|
||||
file.parent.mkdir(parents=True, exist_ok=True)
|
||||
file.touch(exist_ok=True)
|
||||
return SqliteDatabase(file)
|
||||
|
||||
|
||||
db = get_db()
|
||||
|
||||
|
||||
class CachedURLs(Model):
|
||||
uuid = UUIDField(primary_key=True, default=_uuid.uuid4)
|
||||
url = TextField(null=False)
|
||||
ts = FloatField(null=False)
|
||||
metadata = TextField(null=False)
|
||||
|
||||
class Meta:
|
||||
database = db
|
||||
|
||||
if typing.TYPE_CHECKING:
|
||||
uuid: _uuid.UUID
|
||||
url: str
|
||||
ts: float
|
||||
metadata: str
|
||||
|
||||
|
||||
class CachedMedia(Model):
|
||||
uuid = UUIDField(primary_key=True, default=_uuid.uuid4)
|
||||
mxc_url = TextField(null=False)
|
||||
md5 = TextField(null=False, index=True)
|
||||
|
||||
class Meta:
|
||||
database = db
|
||||
|
||||
if typing.TYPE_CHECKING:
|
||||
uuid: _uuid.UUID
|
||||
mxc_url: str
|
||||
md5: str
|
|
@ -1,5 +1,6 @@
|
|||
import contextlib
|
||||
import datetime
|
||||
import fnmatch
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
|
@ -10,8 +11,7 @@ import uuid
|
|||
import fastapi
|
||||
import httpx
|
||||
import logging
|
||||
import sqlite3
|
||||
import appdirs
|
||||
import hashlib
|
||||
from threading import Lock
|
||||
from typing import Annotated
|
||||
from fastapi import Query, Header, HTTPException, Request
|
||||
|
@ -20,23 +20,16 @@ from pathlib import Path
|
|||
from bs4 import BeautifulSoup
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
|
||||
import db
|
||||
|
||||
|
||||
@contextlib.asynccontextmanager
|
||||
async def startup(_):
|
||||
if not CACHE_DIR.exists():
|
||||
CACHE_DIR.mkdir(parents=True)
|
||||
with sqlite3.connect(CACHE_FILE) as conn:
|
||||
conn.execute(
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS cache (
|
||||
uuid TEXT PRIMARY KEY,
|
||||
url TEXT NOT NULL,
|
||||
ts INTEGER NOT NULL,
|
||||
metadata TEXT NOT NULL
|
||||
)
|
||||
"""
|
||||
)
|
||||
yield
|
||||
with db.db:
|
||||
logging.info("Creating tables")
|
||||
db.db.create_tables([db.CachedURLs, db.CachedMedia])
|
||||
db.db.commit()
|
||||
yield
|
||||
|
||||
|
||||
logging.basicConfig(
|
||||
|
@ -73,6 +66,29 @@ INVALID_TOKEN = JSONResponse(
|
|||
{"errcode": "M_INVALID_TOKEN", "error": "Invalid access token"},
|
||||
401
|
||||
)
|
||||
USER_AGENTS = {
|
||||
"twitter": "TwitterBot/1.0",
|
||||
"firefox": "Mozilla/5.0 (X11; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0",
|
||||
"chrome": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, Like Gecko) "
|
||||
"Chrome/121.9.6167.160 Safari/537.36",
|
||||
"google": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; "
|
||||
"+http://www.google.com/bot.html) Chrome/121.9.6167.160 Safari/537.36",
|
||||
"bing": "Mozilla/5.0 (compatible; Bingbot/2.0; +http://www.bing.com/bingbot.htm)",
|
||||
"yahoo": "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
|
||||
"duckduckgo": "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)",
|
||||
"facebook": "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)",
|
||||
"honest": "MatrixDropInURLPreviewBot/0.1 (+https://git.i-am.nexus/nex/drop-in-url-previews)"
|
||||
}
|
||||
os.environ.setdefault("PREVIEW_USER_AGENT", "google")
|
||||
if os.environ["PREVIEW_USER_AGENT"].lower() not in USER_AGENTS:
|
||||
raise ValueError(
|
||||
"Invalid user agent: %r\nMust be one of: %s" % (
|
||||
os.environ["PREVIEW_USER_AGENT"],
|
||||
", ".join(USER_AGENTS.keys())
|
||||
)
|
||||
)
|
||||
USER_AGENT = USER_AGENTS[os.environ["PREVIEW_USER_AGENT"].lower()]
|
||||
logging.debug("Selecting user agent: %r", USER_AGENT)
|
||||
VALID_OG_TAGS = [
|
||||
"og:title",
|
||||
"og:type",
|
||||
|
@ -145,24 +161,14 @@ URL_OG_TAGS = [
|
|||
"og:audio:secure_url"
|
||||
]
|
||||
TWITTER_MAPPING = {
|
||||
"twitter:site": "og:site_name",
|
||||
"twitter:creator": "og:site_name",
|
||||
"twitter:image": "og:image",
|
||||
"twitter:title": "og:title",
|
||||
"twitter:image:width": "og:image:width",
|
||||
"twitter:image:height": "og:image:height",
|
||||
"twitter:image:alt": "og:image:alt",
|
||||
"twitter:description": "og:description",
|
||||
}
|
||||
|
||||
if Path.cwd() == Path("/app"):
|
||||
logging.info("Look to be running in a docker container. Cache will be stored in /app/cache.")
|
||||
CACHE_DIR = Path("/app/cache")
|
||||
else:
|
||||
CACHE_DIR = Path(appdirs.user_cache_dir("matrix-url-preview"))
|
||||
CACHE_DIR.mkdir(parents=True, exist_ok=True)
|
||||
CACHE_FILE = CACHE_DIR / "db.sqlite3"
|
||||
CACHE_FILE.touch(exist_ok=True)
|
||||
logging.debug("Cache file: %r", CACHE_FILE)
|
||||
|
||||
|
||||
def upload_media(
|
||||
client: httpx.Client,
|
||||
|
@ -173,6 +179,19 @@ def upload_media(
|
|||
content_type: str
|
||||
):
|
||||
file.seek(0)
|
||||
# 1000 hurts me because 1024 feels correct, but `MB` does in fact stand for MegaByte, not MebiByte.
|
||||
if len(file.getvalue()) > int(os.getenv("PREVIEW_MAX_MEDIA_MB", "50")) * 1000 * 1000:
|
||||
logging.warning(
|
||||
"Media too large: %.2f Megabytes (max %.2fMB)",
|
||||
len(file.getvalue()),
|
||||
int(os.getenv("PREVIEW_MAX_MEDIA_MB", "50"))
|
||||
)
|
||||
md5 = hashlib.md5(file.getvalue()).hexdigest()
|
||||
value = db.CachedMedia.get_or_none(md5=md5)
|
||||
if value:
|
||||
logging.info("found cached media for %r - %r", md5, value.mxc_url)
|
||||
return value.mxc_url
|
||||
|
||||
logging.info(
|
||||
"Creating media at %r called %r with the content type %r and %d bytes",
|
||||
domain,
|
||||
|
@ -197,6 +216,7 @@ def upload_media(
|
|||
logging.info("Media uploaded successfully")
|
||||
mxc_url = response.json()["content_uri"]
|
||||
logging.debug("Media uploaded: %r", mxc_url)
|
||||
db.CachedMedia.create(mxc_url=mxc_url, md5=md5).save()
|
||||
return mxc_url
|
||||
else:
|
||||
logging.warning("Failed to upload media: HTTP %s", response.status_code)
|
||||
|
@ -204,9 +224,33 @@ def upload_media(
|
|||
return None
|
||||
|
||||
|
||||
def __preview_img(url: str, client: httpx.Client, access_token: str) -> dict:
|
||||
bio = io.BytesIO()
|
||||
# noinspection PyArgumentList
|
||||
with client.stream("GET", url) as response:
|
||||
for chunk in response.iter_bytes():
|
||||
bio.write(chunk)
|
||||
bio.seek(0)
|
||||
mxc_url = upload_media(
|
||||
client,
|
||||
os.environ.get("PREVIEW_HOMESERVER", "https://matrix.org"),
|
||||
access_token,
|
||||
bio,
|
||||
Path(httpx.URL(url).path).name,
|
||||
response.headers.get("content-type", "image/jpeg")
|
||||
)
|
||||
if mxc_url:
|
||||
return {
|
||||
"og:image": mxc_url,
|
||||
"matrix:image:size": len(bio.getvalue())
|
||||
}
|
||||
return {}
|
||||
|
||||
|
||||
@app.get("/preview_url")
|
||||
def preview_url(
|
||||
req: Request,
|
||||
res: JSONResponse,
|
||||
url: Annotated[str, Query(..., description="URL to preview")],
|
||||
ts: int = Query(None, description="The preferred point in time to return a preview for."),
|
||||
access_token_qs: str | None = Query(None, alias="access_token", description="Access token to use for the request."),
|
||||
|
@ -225,40 +269,38 @@ def preview_url(
|
|||
else:
|
||||
return MISSING_TOKEN
|
||||
|
||||
with sqlite3.connect(CACHE_FILE) as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(
|
||||
"SELECT metadata,ts FROM cache WHERE url = ?",
|
||||
(url,)
|
||||
)
|
||||
results = cursor.fetchall()
|
||||
if results:
|
||||
for result in results:
|
||||
# find the one with the closest timestamp
|
||||
metadata, _ts = result
|
||||
created_at = datetime.datetime.fromtimestamp(_ts - 86400)
|
||||
if ts is None or created_at <= datetime.datetime.fromtimestamp(ts):
|
||||
logging.debug("Optimal cache hit for %r", url)
|
||||
return json.loads(metadata)
|
||||
else:
|
||||
logging.debug("No optimal cache matches for url %r.", url)
|
||||
# No close matches, get the latest one
|
||||
metadata, _ts = results[-1]
|
||||
created_at = datetime.datetime.fromtimestamp(_ts)
|
||||
if (datetime.datetime.now() - created_at).days <= 7:
|
||||
logging.debug("Stale cache hit for %r", url)
|
||||
return json.loads(metadata)
|
||||
else:
|
||||
logging.debug("Stale cache miss for %r", url)
|
||||
results = db.CachedURLs.select().where(db.CachedURLs.url == url)
|
||||
if results:
|
||||
for result in results:
|
||||
# find the one with the closest timestamp
|
||||
metadata = json.loads(result.metadata)
|
||||
_ts = result.ts
|
||||
created_at = datetime.datetime.fromtimestamp(_ts - 86400)
|
||||
if ts is None or created_at <= datetime.datetime.fromtimestamp(ts):
|
||||
logging.debug("Optimal cache hit for %r", url)
|
||||
res.headers["X-Cache"] = "optimal"
|
||||
return metadata
|
||||
else:
|
||||
logging.debug("Full cache miss for %r", url)
|
||||
logging.debug("No optimal cache matches for url %r.", url)
|
||||
# No close matches, get the latest one
|
||||
metadata, _ts = results[-1]
|
||||
created_at = datetime.datetime.fromtimestamp(_ts)
|
||||
if (datetime.datetime.now() - created_at).days <= 7:
|
||||
logging.debug("Stale cache hit for %r", url)
|
||||
res.headers["X-Cache"] = "stale"
|
||||
return metadata
|
||||
else:
|
||||
logging.debug("Stale cache miss for %r", url)
|
||||
res.headers["X-Cache"] = "stale-miss"
|
||||
else:
|
||||
logging.debug("Full cache miss for %r", url)
|
||||
res.headers["X-Cache"] = "full-miss"
|
||||
|
||||
domain = os.environ.get("PREVIEW_HOMESERVER", "https://" + req.url.hostname)
|
||||
with lock:
|
||||
with httpx.Client(
|
||||
headers={
|
||||
# "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0"
|
||||
"User-Agent": "TwitterBot/1.0"
|
||||
"User-Agent": USER_AGENT
|
||||
},
|
||||
timeout=60,
|
||||
follow_redirects=False,
|
||||
|
@ -286,8 +328,21 @@ def preview_url(
|
|||
logging.debug(f"Failed to fetch {url}", exc_info=True)
|
||||
raise HTTPException(502, f"Failed to fetch {url} - {e}")
|
||||
|
||||
if "text/html" not in response.headers.get("content-type", ""):
|
||||
return {}
|
||||
content_type = response.headers.get("content-type", "application/octet-stream")
|
||||
if fnmatch.fnmatch(content_type, "image/*"):
|
||||
result = __preview_img(url, client, access_token)
|
||||
db.CachedURLs.create(
|
||||
url=url,
|
||||
ts=round(time.time()),
|
||||
metadata=json.dumps(result)
|
||||
).save()
|
||||
res.headers["Cache-Control"] = "public, max-age=86400"
|
||||
return result
|
||||
if "text/html" not in content_type:
|
||||
res.status_code = 204
|
||||
res.media_type = "text/plain"
|
||||
res.headers["Cache-Control"] = "no-store"
|
||||
return None
|
||||
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
og_tags = {}
|
||||
|
@ -373,18 +428,13 @@ def preview_url(
|
|||
value = og_tags.pop(key, None)
|
||||
og_tags["og:" + key] = value
|
||||
|
||||
with sqlite3.connect(CACHE_FILE) as conn:
|
||||
conn.execute(
|
||||
"INSERT INTO cache (uuid, url, ts, metadata) VALUES (?, ?, ?, ?)",
|
||||
(str(uuid.uuid4()), url, round(time.time()), json.dumps(og_tags))
|
||||
)
|
||||
return JSONResponse(
|
||||
og_tags,
|
||||
200,
|
||||
headers={
|
||||
"Cache-Control": "public, max-age=86400"
|
||||
}
|
||||
)
|
||||
db.CachedURLs.create(
|
||||
url=url,
|
||||
ts=round(time.time()),
|
||||
metadata=json.dumps(og_tags)
|
||||
).save()
|
||||
res.headers["Cache-Control"] = "public, max-age=86400"
|
||||
return og_tags
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
Loading…
Reference in a new issue