# drop-in-url-previews/server.py
#
# 370 lines
# 13 KiB
# Python

import contextlib
import io
import json
import os
import textwrap
import time
import uuid
import fastapi
import httpx
import logging
import sqlite3
import appdirs
from threading import Lock
from typing import Annotated
from fastapi import Query, Header, HTTPException, Request
from fastapi.responses import JSONResponse
from pathlib import Path
from bs4 import BeautifulSoup
from fastapi.middleware.cors import CORSMiddleware
@contextlib.asynccontextmanager
async def startup(_):
    """Lifespan hook: make sure the cache directory and the cache table exist.

    Everything before the ``yield`` runs before the application starts
    serving; nothing is done after it.

    :param _: The object handed over by the lifespan protocol (unused).
    """
    # exist_ok avoids the race between an existence check and the creation.
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    # A sqlite3 connection used as a context manager only commits or rolls
    # back; it does not close. closing() releases the handle explicitly.
    with contextlib.closing(sqlite3.connect(CACHE_FILE)) as conn:
        with conn:
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS cache (
                    uuid TEXT PRIMARY KEY,
                    url TEXT NOT NULL,
                    ts INTEGER NOT NULL,
                    metadata TEXT NOT NULL
                )
                """
            )
    yield
# Resolve the root log level from $LOG_LEVEL. logging.getLevelName() returns
# the string "Level <NAME>" for names it does not know, which would make
# basicConfig() raise at import time, so unknown names fall back to INFO.
_requested_log_level = os.environ.get("LOG_LEVEL", "INFO").upper()
_root_log_level = logging.getLevelName(_requested_log_level)
_log_level_is_valid = isinstance(_root_log_level, int)
if not _log_level_is_valid:
    _root_log_level = logging.INFO
logging.basicConfig(
    level=_root_log_level,
    format="%(asctime)s:%(levelname)s:%(name)s:%(message)s",
    datefmt="%d/%m/%Y %H:%M:%S",
)
if not _log_level_is_valid:
    logging.warning("Unknown LOG_LEVEL %r, falling back to INFO", _requested_log_level)
# The HTTP client loggers are never allowed below INFO. max() keeps a stricter
# LOG_LEVEL (e.g. WARNING) in effect for them instead of forcing their INFO
# records through to the root handlers.
for _noisy_logger in ("httpcore.connection", "httpcore.http11", "httpx"):
    logging.getLogger(_noisy_logger).setLevel(max(_root_log_level, logging.INFO))
# ASGI application. `startup` is installed as the lifespan handler and the
# root path can be overridden through $PREVIEW_ROOT_PATH.
app = fastapi.FastAPI(
    lifespan=startup,
    root_path=os.environ.get("PREVIEW_ROOT_PATH", ""),
)
# Module-wide mutex used by the request handler.
lock = Lock()
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive - confirm this is intended.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*", "Authorization"],
    allow_methods=["GET", "OPTIONS"],
    allow_credentials=True,
    allow_origins=["*"],
)
# Pre-built 401 responses carrying Matrix-style error codes. MISSING_TOKEN is
# returned when a request carries no access token; INVALID_TOKEN is not
# referenced by the code visible in this file.
MISSING_TOKEN = JSONResponse(
    content={"errcode": "M_MISSING_TOKEN", "error": "Missing access token"},
    status_code=401,
)
INVALID_TOKEN = JSONResponse(
    content={"errcode": "M_INVALID_TOKEN", "error": "Invalid access token"},
    status_code=401,
)
# Meta-tag property names that are copied into a preview. Anything that is not
# listed here (or in TWITTER_MAPPING) is ignored when a page is parsed.
VALID_OG_TAGS = [
    "og:title",
    "og:type",
    "og:image",
    "og:url",
    "og:audio",
    "og:description",
    "og:determiner",
    "og:locale",
    # "alternate" is the spelling used by the Open Graph protocol; the older
    # "alternative" entry is kept so previously accepted input still passes.
    "og:locale:alternate",
    "og:locale:alternative",
    "og:site_name",
    "og:image:url",
    "og:image:secure_url",
    "og:image:type",
    "og:image:width",
    "og:image:height",
    "og:image:alt",
    "og:video",
    "og:video:url",
    "og:video:secure_url",
    "og:video:type",
    "og:video:width",
    "og:video:height",
    "og:video:alt",
    "og:video:actor",
    "og:video:actor:role",
    "og:video:director",
    "og:video:writer",
    "og:video:duration",
    "og:video:release_date",
    "og:video:tag",
    # A missing comma here used to fuse this entry with "og:audio:url" through
    # implicit string concatenation, silently dropping both tags.
    "og:video:series",
    "og:audio:url",
    "og:audio:secure_url",
    "og:audio:type",
    "og:music:duration",
    "og:music:album",
    "og:music:album:disc",
    "og:music:album:track",
    "og:music:musician",
    "og:music:song",
    "og:music:song:disc",
    "og:music:song:track",
    "og:music:release_date",
    "og:music:creator",
    "og:article:published_time",
    "og:article:modified_time",
    "og:article:expiration_time",
    "og:article:author",
    "og:article:section",
    "og:article:tag",
    "og:book:author",
    "og:book:tag",
    "og:book:isbn",
    "og:book:release_date",
    "og:profile:first_name",
    "og:profile:last_name",
    "og:profile:username",
    "og:profile:gender",
]
# Tags whose value is a media URL. For every media kind the bare tag and its
# ":url" / ":secure_url" variants are listed, grouped by kind.
URL_OG_TAGS = [
    f"og:{kind}{variant}"
    for kind in ("video", "image", "audio")
    for variant in ("", ":url", ":secure_url")
]
# Twitter card property -> Open Graph property it is stored under.
# Note that both "twitter:site" and "twitter:creator" land on "og:site_name".
TWITTER_MAPPING = dict(
    (
        ("twitter:site", "og:site_name"),
        ("twitter:creator", "og:site_name"),
        ("twitter:image", "og:image"),
        ("twitter:title", "og:title"),
        ("twitter:image:width", "og:image:width"),
        ("twitter:image:height", "og:image:height"),
    )
)
# Decide where the SQLite cache lives. A working directory of /app is treated
# as a sign of running inside a docker container; otherwise the per-user cache
# directory reported by appdirs is used.
if Path("/app") != Path.cwd():
    CACHE_DIR = Path(appdirs.user_cache_dir("matrix-url-preview"))
else:
    logging.info("Look to be running in a docker container. Cache will be stored in /app/cache.")
    CACHE_DIR = Path("/app/cache")
# Create the directory and an empty database file up front.
CACHE_DIR.mkdir(exist_ok=True, parents=True)
CACHE_FILE = CACHE_DIR.joinpath("db.sqlite3")
CACHE_FILE.touch(exist_ok=True)
logging.debug("Cache file: %r", CACHE_FILE)
def upload_media(
        domain: str,
        access_token: str,
        file: io.BytesIO,
        filename: str,
        content_type: str
) -> str | None:
    """Upload an in-memory file to a Matrix homeserver's media endpoint.

    :param domain: Base URL of the homeserver; the upload path is appended to it.
    :param access_token: Token sent as a ``Bearer`` Authorization header.
    :param file: Buffer whose whole content is uploaded. It is rewound to
        position 0 as a side effect.
    :param filename: Sent as the ``filename`` query parameter.
    :param content_type: Sent as the ``Content-Type`` header.
    :return: The ``content_uri`` from the response, or ``None`` when the server
        did not answer with HTTP 200 or the body carried no ``content_uri``.
    :raises httpx.HTTPError: Transport failures are not handled here.
    """
    file.seek(0)
    payload = file.getvalue()
    logging.info(
        "Creating media at %r called %r with the content type %r and %d bytes",
        domain,
        filename,
        content_type,
        len(payload)
    )
    # NOTE(review): `r0` looks like the legacy media API prefix - confirm the
    # target homeserver still serves it.
    response = httpx.post(
        "%s/_matrix/media/r0/upload" % domain,
        headers={
            "Authorization": f"Bearer {access_token}",
            "Content-Type": content_type
        },
        # Raw bytes belong in `content=`; passing them through `data=` is
        # deprecated in httpx.
        content=payload,
        params={
            "filename": filename
        }
    )
    if response.status_code != 200:
        logging.warning("Failed to upload media: HTTP %s", response.status_code)
        logging.debug("Response: %r", response.text)
        return None
    try:
        mxc_url = response.json()["content_uri"]
    except (ValueError, KeyError, TypeError):
        # A 200 with a malformed body must not bubble up as an unhandled error.
        logging.warning("Failed to upload media: response carried no content_uri")
        logging.debug("Response: %r", response.text)
        return None
    logging.info("Media uploaded successfully")
    logging.debug("Media uploaded: %r", mxc_url)
    return mxc_url
# Upper bound on manually followed same-host redirects, so that a redirect loop
# cannot keep a request (and the lock it holds) busy forever.
_MAX_SAME_HOST_REDIRECTS = 20


def _extract_access_token(query_token: str | None, authorization: str | None) -> str | None:
    """Return the access token from the query string or a Bearer header, else None."""
    if query_token is not None:
        return query_token
    if authorization and authorization.startswith("Bearer "):
        return authorization.removeprefix("Bearer ")
    return None


def _lookup_cached_preview(url: str, ts: int | None) -> dict | None:
    """Return a cached preview for *url* that is acceptable for *ts*, or None on a miss."""
    # closing() is needed because a sqlite3 connection used as a context
    # manager only commits/rolls back and never closes itself.
    with contextlib.closing(sqlite3.connect(CACHE_FILE)) as conn:
        rows = conn.execute(
            # Newest first, so rows[0] really is the latest entry.
            "SELECT metadata,ts FROM cache WHERE url = ? ORDER BY ts DESC",
            (url,)
        ).fetchall()
    if not rows:
        logging.debug("Full cache miss for %r", url)
        return None
    if ts is None:
        # No preferred point in time: the most recent entry is the best answer.
        logging.debug("Optimal cache hit for %r", url)
        return json.loads(rows[0][0])
    # The Matrix client-server API defines `ts` in milliseconds, while the
    # cache stores whole seconds; compare in seconds.
    wanted = ts / 1000
    for metadata, cached_at in rows:
        if abs(wanted - cached_at) < 3600:
            logging.debug("Optimal cache hit for %r", url)
            return json.loads(metadata)
    logging.debug("No optimal cache matches for url %r.", url)
    # No close match: fall back to the latest entry unless it is more than a
    # week away from the requested time, in which case the page is re-fetched.
    metadata, cached_at = rows[0]
    if abs(wanted - cached_at) < 604800:
        logging.debug("Stale cache hit for %r", url)
        return json.loads(metadata)
    logging.debug("Cache miss for %r", url)
    return None


def _fetch_page(url: str) -> httpx.Response:
    """Fetch *url*, following redirects only while they stay on the same host."""
    with httpx.Client(
        headers={
            "User-Agent": "TwitterBot/1.0"
        },
        timeout=60,
        follow_redirects=False
    ) as client:
        response = client.get(url)
        hops = 0
        while True:
            if response.status_code not in range(200, 400):
                response.raise_for_status()
            upcoming = response.next_request
            if upcoming is None or upcoming.url.host != response.url.host:
                return response
            hops += 1
            if hops > _MAX_SAME_HOST_REDIRECTS:
                raise HTTPException(500, "Failed to fetch URL: too many redirects")
            response = client.send(upcoming)


def _collect_og_tags(html: str) -> dict:
    """Parse *html* and return the accepted OG tags, with Twitter tags mapped onto OG names."""
    soup = BeautifulSoup(html, "html.parser")
    accepted = {*VALID_OG_TAGS, *TWITTER_MAPPING}
    og_tags = {}
    for tag in soup.find_all("meta"):
        logging.debug("Found meta tag: %r", tag)
        tag_name = tag.get("property", "")
        if not tag_name.startswith(("og:", "twitter:")):
            continue
        content = tag.get("content")
        logging.debug(
            "Tag %r is an OG/Twitter tag, with property: %r",
            tag_name,
            textwrap.shorten(content if content is not None else "N/A", 100),
        )
        # A tag without a content attribute carries nothing usable and would
        # otherwise put None into the preview (and into the media fetcher).
        if content is None:
            continue
        if tag_name in accepted:
            og_tags[tag_name] = content

    for name in list(og_tags):
        if not name.startswith("twitter:"):
            continue
        value = og_tags.pop(name)
        mapped = TWITTER_MAPPING.get(name)
        if mapped is None:
            logging.warning("Unrecognized Twitter tag: %r", name)
            continue
        og_tags[mapped] = value
        logging.debug("Mapped twitter tag %r to og tag %r", name, mapped)
    return og_tags


def _rehost_media(og_tags: dict, page_url: httpx.URL, domain: str, access_token: str) -> None:
    """Download every media URL in *og_tags* and replace it, in place, with the uploaded copy."""
    for tag_name in URL_OG_TAGS:
        if tag_name not in og_tags:
            continue
        logging.debug("Retrieving tag %r to see if it needs uploading to Matrix", tag_name)
        _url = og_tags[tag_name]
        logging.debug("%r = %r", tag_name, _url)
        try:
            # Pages may reference media relative to themselves; absolute URLs
            # pass through join() unchanged.
            media_url = page_url.join(_url)
            with httpx.stream(
                "GET",
                media_url,
                headers={
                    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0"
                },
                timeout=60,
                follow_redirects=True
            ) as response_media:
                media_type = response_media.headers.get("content-type", "")
                if response_media.status_code not in range(200, 300):
                    logging.warning("Failed to fetch media: %r - HTTP %s", _url, response_media.status_code)
                    og_tags.pop(tag_name, None)
                    continue
                if not media_type.startswith(("image/", "video/", "audio/")):
                    logging.warning("Failed to fetch media: %r - not a media type", _url)
                    og_tags.pop(tag_name, None)
                    continue
                # The header is only used for logging; tolerate garbage values.
                declared = response_media.headers.get("content-length", "")
                logging.info(
                    "Downloading {:,} bytes of media: {!r}".format(
                        int(declared) if declared.isdigit() else 0,
                        _url
                    )
                )
                payload = response_media.read()

            upload_response = upload_media(
                domain,
                access_token,
                io.BytesIO(payload),
                Path(media_url.path).name,
                media_type
            )
            if upload_response:
                og_tags["original:" + tag_name.removeprefix("og:")] = _url
                og_tags[tag_name] = upload_response
                if tag_name in ("og:image", "og:image:url", "og:image:secure_url"):
                    og_tags["matrix:image:size"] = len(payload)
                logging.info("Uploaded media: %r, set %r to %r", _url, tag_name, upload_response)
            else:
                # The tag keeps pointing at the original location.
                logging.warning("Failed to upload media: %r (no returned mxc)", _url)
        except (httpx.HTTPError, httpx.InvalidURL) as e:
            # InvalidURL is not an HTTPError subclass, so it is listed explicitly.
            logging.exception("Failed to fetch url for OG tags @ %r: %r", _url, e)
            og_tags.pop(tag_name, None)


def _store_preview(url: str, og_tags: dict) -> None:
    """Insert the freshly generated preview for *url* into the cache table."""
    with contextlib.closing(sqlite3.connect(CACHE_FILE)) as conn:
        with conn:
            conn.execute(
                "INSERT INTO cache (uuid, url, ts, metadata) VALUES (?, ?, ?, ?)",
                (str(uuid.uuid4()), url, round(time.time()), json.dumps(og_tags))
            )


@app.get("/preview_url")
def preview_url(
        req: Request,
        url: Annotated[str, Query(..., description="URL to preview")],
        ts: int = Query(None, description="The preferred point in time to return a preview for."),
        access_token_qs: str | None = Query(None, alias="access_token", description="Access token to use for the request."),
        access_token_header: str | None = Header(
            None,
            alias="Authorization",
            description="Access token to use for the request."
        ),
):
    """Return Open Graph metadata for *url*, re-hosting referenced media on the homeserver.

    The cache is consulted first. On a miss the page is fetched, its
    ``og:``/``twitter:`` meta tags are collected, media they point to is
    downloaded and uploaded through ``upload_media``, and the result is cached.

    :param req: Incoming request; its hostname is the homeserver fallback when
        ``$PREVIEW_HOMESERVER`` is unset.
    :param url: The page to preview.
    :param ts: Preferred point in time for the preview, if any.
    :param access_token_qs: Access token from the ``access_token`` query parameter.
    :param access_token_header: ``Authorization`` header; used when it starts
        with ``Bearer `` and no query token was given.
    :return: ``MISSING_TOKEN`` when no token was supplied, a cached preview
        dict, ``{}`` for non-HTML pages, or a ``JSONResponse`` with the tags.
    :raises HTTPException: 500 when the page itself cannot be fetched.
    """
    access_token = _extract_access_token(access_token_qs, access_token_header)
    if access_token is None:
        return MISSING_TOKEN

    cached = _lookup_cached_preview(url, ts)
    # An empty dict is a legitimate cached preview, so test against None.
    if cached is not None:
        return cached

    # Only touch the request hostname when no homeserver is configured.
    domain = os.environ.get("PREVIEW_HOMESERVER")
    if domain is None:
        domain = "https://" + req.url.hostname

    # NOTE(review): the whole fetch/upload/store pipeline runs under the global
    # lock, i.e. previews are generated one at a time - confirm this scope.
    with lock:
        try:
            response = _fetch_page(url)
        except (httpx.HTTPError, httpx.InvalidURL) as e:
            raise HTTPException(500, f"Failed to fetch URL: {e!r}") from e

        if "text/html" not in response.headers.get("content-type", ""):
            return {}

        og_tags = _collect_og_tags(response.text)
        _rehost_media(og_tags, response.url, domain, access_token)

        # Whatever is left without a known prefix is filed under "og:".
        for key in list(og_tags):
            if not key.startswith(("original:", "og:", "matrix:")):
                og_tags["og:" + key] = og_tags.pop(key)

        _store_preview(url, og_tags)

    return JSONResponse(
        og_tags,
        200,
        headers={
            "Cache-Control": "public, max-age=86400"
        }
    )
if __name__ == "__main__":
    import uvicorn

    # Listen address and port come from the environment, with defaults.
    bind_host = os.getenv("PREVIEW_HOST", "0.0.0.0")
    bind_port = int(os.getenv("PREVIEW_PORT", 2226))
    # If you want to enable reverse-proxy support, you must set the $FORWARDED_ALLOW_IPS environment variable.
    # See: https://www.uvicorn.org/settings/#http
    uvicorn.run(app, host=bind_host, port=bind_port)