college-bot-v2/src/cogs/ytdl.py

import asyncio
import functools
import hashlib
import logging
import math
import time
import datetime

import httpx
import subprocess
import tempfile
import textwrap
import typing
import uuid
from pathlib import Path
from urllib.parse import urlparse

import aiosqlite
import discord
import yt_dlp
from discord.ext import commands

# Location of "cookies.txt" inside the process's working directory, resolved once at import time.
COOKIES_TXT = Path.cwd().joinpath("cookies.txt")


class YTDLCog(commands.Cog):
    def __init__(self, bot: commands.Bot) -> None:
        self.bot = bot
        self.log = logging.getLogger("jimmy.cogs.ytdl")
        self.common_formats = {
            "144p": "bv[width<=144]+ba[ext=webm]/bv[width<=144]+ba[ext=m4a]/bv[width<=144]+ba/b[width<=144]",
            "240p": "bv[width<=240]+ba[ext=webm]/bv[width<=240]+ba[ext=m4a]/bv[width<=240]+ba/b[width<=240]",
            "360p": "bv[width<=360]+ba[ext=webm]/bv[width<=360]+ba[ext=m4a]/bv[width<=360]+ba/b[width<=360]",
            "480p": "bv[width<=500]+ba[ext=webm]/bv[width<=500]+ba[ext=m4a]/bv[width<=500]+bab[width<=480]",
            "720p": "bv[width<=720]+ba[ext=webm]/bv[width<=720]+ba[ext=m4a]/bv[width<=720]+ba/b[width<=720]",
            "1080p": "bv[width<=1080]+ba[ext=webm]/bv[width<=1080]+ba[ext=m4a]/bv[width<=1080]+ba",
            "1440p": "bv[width<=1440]+ba[ext=webm]/bv[width<=1440]+ba[ext=m4a]/bv[width<=1440]+ba",
            "2160p": "bv[width<=2160]+ba[ext=webm]/bv[width<=2160]+ba[ext=m4a]/bv[width<=2160]+ba",
            "mp3": "ba[filesize<500M]",
            "m4a": "ba[ext=m4a][filesize<500M]",
            "opus": "ba[ext=webm][filesize<500M]",
            "vorbis": "ba[ext=webm][filesize<500M]",
            "ogg": "ba[ext=webm][filesize<500M]",
        }
        self.default_options = {
            "noplaylist": True,
            "nocheckcertificate": True,
            "no_color": True,
            "noprogress": True,
            "logger": self.log,
            "format": "((bv+ba/b)[vcodec!=h265][filesize<500M]/b[filesize<=500M]/b)",
            "outtmpl": "%(title).50s.%(ext)s",
            "format_sort": [
                "vcodec:h264",
                "acodec:aac",
                "vcodec:vp9",
                "acodec:opus",
                "acodec:vorbis",
                "vcodec:vp8",
                "ext",
            ],
            "merge_output_format": "webm/mp4/mov/m4a/oga/ogg/mp3/mka/mkv",
            "source_address": "0.0.0.0",
            "concurrent_fragment_downloads": 4,
            # "max_filesize": (25 * 1024 * 1024) - 256
        }
        self.colours = {
            "youtube.com": 0xFF0000,
            "youtu.be": 0xFF0000,
            "tiktok.com": 0x25F5EF,
            "instagram.com": 0xE1306C,
            "shronk.net": 0xFFF952,
        }

    async def _init_db(self):
        """
        Makes sure the `downloads` table exists in the ./data/ytdl.db SQLite database.
        :return: None
        """
        schema = """
                CREATE TABLE IF NOT EXISTS downloads (
                    key TEXT PRIMARY KEY,
                    message_id INTEGER NOT NULL UNIQUE,
                    channel_id INTEGER NOT NULL,
                    webpage_url TEXT NOT NULL,
                    format_id TEXT NOT NULL,
                    attachment_index INTEGER NOT NULL DEFAULT 0
                )
                """
        async with aiosqlite.connect("./data/ytdl.db") as connection:
            await connection.execute(schema)
            await connection.commit()

    async def save_link(
        self,
        message: discord.Message,
        webpage_url: str,
        format_id: str,
        attachment_index: int = 0,
        *,
        snip: typing.Optional[str] = None,
    ):
        """
        Saves a link to discord to prevent having to re-download it.
        :param message: The download message with the attachment.
        :param webpage_url: The "webpage_url" key of the metadata
        :param format_id: The "format_Id" key of the metadata
        :param attachment_index: The index of the attachment. Defaults to 0
        :param snip: The start and end time to snip the video. e.g. 00:00:00-00:10:00
        :return: The created hash key
        """
        snip = snip or "*"
        _hash = hashlib.md5(f"{webpage_url}:{format_id}:{snip}".encode()).hexdigest()
        try:
            await self._init_db()
        except Exception as e:
            logging.error("Failed to initialise ytdl database: %s", e, exc_info=True)
            return
        async with aiosqlite.connect("./data/ytdl.db") as db:
            self.log.debug(
                "Saving %r (%r:%r:%r) with message %d>%d, index %d",
                _hash,
                webpage_url,
                format_id,
                snip,
                message.channel.id,
                message.id,
                attachment_index,
            )
            await db.execute(
                """
                INSERT INTO downloads (key, message_id, channel_id, webpage_url, format_id, attachment_index)
                VALUES (?, ?, ?, ?, ?, ?)
                ON CONFLICT (key) DO UPDATE SET
                    message_id=excluded.message_id,
                    channel_id=excluded.channel_id,
                    attachment_index=excluded.attachment_index
                """,
                (_hash, message.id, message.channel.id, webpage_url, format_id, attachment_index),
            )
            await db.commit()
            return _hash

    async def get_saved(self, webpage_url: str, format_id: str, snip: str) -> typing.Optional[str]:
        """
        Attempts to retrieve the attachment URL of a previously saved download.
        :param webpage_url: The webpage url
        :param format_id: The format ID
        :param snip: The start and end time to snip the video. e.g. 00:00:00-00:10:00
        :return: the URL, if found and valid.
        """
        try:
            await self._init_db()
        except Exception as e:
            logging.error("Failed to initialise ytdl database: %s", e, exc_info=True)
            return
        async with aiosqlite.connect("./data/ytdl.db") as db:
            _hash = hashlib.md5(f"{webpage_url}:{format_id}:{snip}".encode()).hexdigest()
            self.log.debug(
                "Attempting to find a saved download for '%s:%s:%s' (%r).", webpage_url, format_id, snip, _hash
            )
            cursor = await db.execute(
                "SELECT message_id, channel_id, attachment_index FROM downloads WHERE key=?", (_hash,)
            )
            entry = await cursor.fetchone()
            if not entry:
                self.log.debug("There was no saved download.")
                return
            message_id, channel_id, attachment_index = entry
            channel = self.bot.get_channel(channel_id)
            if not channel:
                self.log.debug("Channel %r was not found.", channel_id)
                return
            try:
                message = await channel.fetch_message(message_id)
            except discord.HTTPException:
                self.log.debug("%r did not contain a message with ID %r", channel, message_id)
                await db.execute("DELETE FROM downloads WHERE key=?", (_hash,))
                return

            try:
                url = message.attachments[attachment_index].url
                self.log.debug("Found URL %r, returning.", url)
                return url
            except IndexError:
                self.log.debug("Attachment index %d is out of range (%r)", attachment_index, message.attachments)
                return

    def convert_to_m4a(self, file: Path) -> Path:
        """
        Converts a file to m4a format.
        :param file: The file to convert
        :return: The converted file
        """
        new_file = file.with_suffix(".m4a")
        args = [
            "-vn",
            "-sn",
            "-i",
            str(file),
            "-c:a",
            "aac",
            "-b:a",
            "96k",
            "-movflags",
            "faststart",
            "-y",
            str(new_file),
        ]
        self.log.debug("Running command: ffmpeg %s", " ".join(args))
        process = subprocess.run(["ffmpeg", *args], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if process.returncode != 0:
            raise RuntimeError(process.stderr.decode())
        return new_file

    @staticmethod
    async def upload_to_0x0(name: str, data: typing.IO[bytes], mime_type: str | None = None) -> str:
        """
        Uploads a binary file object to https://0x0.st as a multipart form.
        :param name: The file name to send with the upload
        :param data: The binary file object to upload. It is rewound to the start before being sent.
        :param mime_type: The MIME type to send. When not given, it is sniffed from the first 4096 bytes with `magic`.
        :return: The path (without the leading slash) of the URL that the service responded with
        """
        if not mime_type:
            import magic

            head = data.read(4096)
            mime_type = await asyncio.to_thread(magic.from_buffer, head, mime=True)
        data.seek(0)

        form = {"file": (name, data, mime_type)}
        request_headers = {"User-Agent": "CollegeBot (matrix: @nex:nexy7574.co.uk)"}
        async with httpx.AsyncClient() as client:
            response = await client.post("https://0x0.st", files=form, headers=request_headers)
            if response.status_code != 200:
                response.raise_for_status()
                # Only reached when raise_for_status() did not raise: there is no URL to hand back.
                return None
            return urlparse(response.text).path[1:]

    @commands.slash_command(name="yt-dl")
    # @commands.bot_has_permissions(send_messages=True, embed_links=True, attach_files=True)
    async def yt_dl_command(
        self,
        ctx: discord.ApplicationContext,
        url: typing.Annotated[str, discord.Option(str, description="The URL to download from.", required=True)],
        user_format: typing.Annotated[
            typing.Optional[str],
            discord.Option(
                str,
                name="format",
                description="The name of the format to download. Can also specify resolutions for youtube.",
                required=False,
                default=None,
            ),
        ],
        audio_only: typing.Annotated[
            bool,
            discord.Option(
                bool,
                name="audio-only",
                description="Whether to convert result into an m4a file. Overwrites `format` if True.",
                required=False,
                default=False,
            ),
        ],
        snip: typing.Annotated[
            typing.Optional[str],
            discord.Option(description="A start and end position to trim. e.g. 00:00:00-00:10:00.", required=False),
        ],
        subtitles: typing.Annotated[
            typing.Optional[str],
            discord.Option(
                str,
                description="The language code of the subtitles to download. e.g. 'en', 'auto'",
                required=False,
            ),
        ]
    ):
        """Runs yt-dlp and outputs into discord."""
        # NOTE(review): the docstring above is deliberately left untouched - py-cord appears to use it as the slash
        # command's description, so editing it would change what users see. Documentation lives in comments instead.
        #
        # Flow:
        #   1. Build the yt-dlp options from default_options plus the format / audio-only / subtitle choices.
        #   2. Extract the metadata (without downloading) and show it in an embed with a cancel button.
        #   3. If get_saved() knows a previous upload for this URL/format/snip, link to it and stop.
        #   4. Download in a worker thread, then optionally trim (snip) with ffmpeg and/or convert to m4a.
        #   5. Upload to 0x0.st when the file is >= 20MiB or uses HEVC/AV1; otherwise attach it to the response
        #      and remember the message with save_link().
        #
        # Parameters:
        #   url:         the page to hand to yt-dlp.
        #   user_format: a key of self.common_formats, or a raw yt-dlp format selector.
        #   audio_only:  download the best audio and make sure the result is an m4a file.
        #   snip:        "start-end" (or just "start") positions passed to ffmpeg's -ss / -to.
        #   subtitles:   a language code or "auto"; append "+y" (or "+1" / "+t") to also embed them.
        await ctx.defer()
        last_edit = time.time()
        options = self.default_options.copy()
        stop = asyncio.Event()
        # The progress hook runs in a worker thread (the download is dispatched with asyncio.to_thread), so it has
        # to hand its coroutines back to this loop in a thread-safe manner.
        loop = asyncio.get_running_loop()
        # Progress denominator of last resort when yt-dlp reports no total size. ctx.guild may be None (e.g. when
        # the command is not used in a guild), in which case the fragment counters are used instead.
        fallback_total = getattr(ctx.guild, "filesize_limit", None)

        def _download_hook(_data: dict[str, typing.Any]):
            nonlocal last_edit
            # Raising here is how a pressed "Cancel download" button aborts the running download.
            if stop.is_set():
                raise RuntimeError("Download cancelled.")
            n = time.time()
            _total = _data.get("total_bytes") or _data.get("total_bytes_estimate") or fallback_total
            if _total:
                _percent = round((_data.get("downloaded_bytes") or 0) / _total * 100, 2)
            else:
                _total = max(1, _data.get("fragment_count", 4096) or 4096)
                _percent = round(max(_data.get("fragment_index", 1) or 1, 1) / _total * 100, 2)
            _speed_bytes_per_second = _data.get("speed") or 1
            _speed_megabits_per_second = round((_speed_bytes_per_second * 8) / 1024 / 1024)
            if _data.get("eta"):
                _eta = discord.utils.utcnow() + datetime.timedelta(seconds=_data.get("eta"))
            else:
                _eta = discord.utils.utcnow() + datetime.timedelta(minutes=1)
            # Cap at 10 blocks: the percentage can exceed 100 when the total was only a guess.
            blocks = "#" * min(10, math.floor(_percent / 10))
            bar = f"{blocks}{'.' * (10 - len(blocks))}"
            line = (f"{_percent}% [{bar}] | {_speed_megabits_per_second}Mbps | "
                    f"ETA {discord.utils.format_dt(_eta, 'R')}")
            # Throttle message edits to roughly one per second.
            if (n - last_edit) >= 1.1:
                embed.clear_fields()
                embed.add_field(name="Progress", value=line)
                # loop.create_task() is not thread-safe; run_coroutine_threadsafe() is the supported way to
                # schedule a coroutine on the loop from another thread.
                asyncio.run_coroutine_threadsafe(ctx.edit(embed=embed), loop)
                last_edit = time.time()
        options["progress_hooks"] = [_download_hook]

        description = ""

        with tempfile.TemporaryDirectory(prefix="jimmy-ytdl-") as temp_dir:
            temp_dir = Path(temp_dir)
            # Make yt-dlp write both final and intermediate files into the temporary directory.
            paths = {
                target: str(temp_dir)
                for target in (
                    "home",
                    "temp",
                )
            }

            chosen_format = self.default_options["format"]
            if user_format:
                if user_format in self.common_formats:
                    chosen_format = self.common_formats[user_format]
                else:
                    # Not a known shorthand: treat it as a raw yt-dlp format selector.
                    chosen_format = user_format

            options.setdefault("postprocessors", [])
            if audio_only:
                # Overwrite format here to be best audio under 20 megabytes.
                chosen_format = "ba[filesize<20M]"
                # Also force sorting by the best audio bitrate first.
                options["format_sort"] = ["abr", "br"]
                # noinspection PyTypeChecker
                options["postprocessors"].append(
                    {"key": "FFmpegExtractAudio", "preferredquality": "96", "preferredcodec": "best"}
                )
            options["format"] = chosen_format
            options["paths"] = paths

            if subtitles:
                # "<lang>+<flag>": a flag starting with y, 1 or t means "embed the subtitles into the file".
                subtitles, burn = subtitles.split("+", 1) if "+" in subtitles else (subtitles, "0")
                # Slice instead of indexing: an input such as "en+" leaves the flag empty.
                burn = burn[:1].lower() in ("y", "1", "t")
                if subtitles.lower() == "auto":
                    options["writeautosubtitles"] = True
                else:
                    options["writesubtitles"] = True
                    options["subtitleslangs"] = [subtitles]

                if burn:
                    # noinspection PyTypeChecker
                    options["postprocessors"].append(
                        {"key": "FFmpegEmbedSubtitle", "already_have_subtitle": True}
                    )

            with yt_dlp.YoutubeDL(options) as downloader:
                await ctx.respond(embed=discord.Embed().set_footer(text="Downloading (step 1/10)"))
                try:
                    # noinspection PyTypeChecker
                    extracted_info = await asyncio.to_thread(downloader.extract_info, url, download=False)
                except yt_dlp.utils.DownloadError as e:
                    # Metadata extraction failed: fall back to placeholder values so the embed can still be
                    # built and show the error. The download below is still attempted.
                    extracted_info = {
                        "title": "error",
                        "thumbnail_url": None,
                        "webpage_url": url,
                        "format": "error",
                        "format_id": "-1",
                        "ext": "wav",
                        "format_note": str(e),
                        "resolution": "1x1",
                        "fps": "1",
                        "vcodec": "error",
                        "acodec": "error",
                        "filesize": 0,
                    }
                    title = "error"
                    description = str(e)
                    thumbnail_url = webpage_url = None
                    likes = views = 0
                    # Keep vcodec bound: the upload step inspects it if the download succeeds after all.
                    vcodec = "error"
                    # A random ID guarantees that get_saved() cannot match a cached download.
                    chosen_format_id = str(uuid.uuid4())
                else:
                    title = extracted_info.get("title", url) or url
                    title = textwrap.shorten(title, 100)
                    thumbnail_url = extracted_info.get("thumbnail") or None
                    webpage_url = extracted_info.get("webpage_url", url)

                    chosen_format = extracted_info.get("format") or chosen_format or str(uuid.uuid4())
                    chosen_format_id = extracted_info.get("format_id") or str(uuid.uuid4())
                    final_extension = extracted_info.get("ext") or "mp4"
                    format_note = extracted_info.get("format_note", "%s (%s)" % (chosen_format, chosen_format_id)) or ""
                    resolution = extracted_info.get("resolution") or "1x1"
                    fps = extracted_info.get("fps", 0.0) or 0.0
                    vcodec = extracted_info.get("vcodec") or "h264"
                    acodec = extracted_info.get("acodec") or "aac"
                    # Fall back to the approximate size when the exact one is missing or None.
                    filesize = extracted_info.get("filesize") or extracted_info.get("filesize_approx")
                    likes = extracted_info.get("like_count", extracted_info.get("average_rating", 0))
                    views = extracted_info.get("view_count", 0)

                    lines = []
                    if chosen_format and chosen_format_id:
                        lines.append(
                            "* Chosen format: `%s` (`%s`)" % (chosen_format, chosen_format_id),
                        )
                    if format_note:
                        lines.append("* Format note: %r" % format_note)
                    if final_extension:
                        lines.append("* File extension: " + final_extension)
                    if resolution:
                        _s = resolution
                        if fps:
                            _s += " @ %s FPS" % fps
                        lines.append("* Resolution: " + _s)
                    if vcodec or acodec:
                        lines.append("* Codecs: %s+%s" % (vcodec or "N/A", acodec or "N/A"))
                    if filesize:
                        lines.append("* Filesize: %s" % yt_dlp.utils.format_bytes(filesize))

                    if lines:
                        description += "\n"
                        description += "\n".join(lines)

                # self.colours is keyed by bare domain, so drop a leading "www." before looking it up.
                # webpage_url is None when extraction failed, hence the `or ""`.
                domain = urlparse(webpage_url or "").netloc.removeprefix("www.")
                embed = discord.Embed(
                    title=title,
                    description=description,
                    url=webpage_url,
                    colour=self.colours.get(domain, discord.Colour.og_blurple()),
                )
                embed.add_field(
                    name="Progress",
                    value="0% [..........]"
                )
                embed.set_footer(text="Downloading (step 2/10)")
                embed.set_thumbnail(url=thumbnail_url)

                class StopView(discord.ui.View):
                    @discord.ui.button(label="Cancel download", style=discord.ButtonStyle.danger)
                    async def _stop(self, button: discord.ui.Button, interaction: discord.Interaction):
                        # The progress hook checks this event and aborts the download when it is set.
                        stop.set()
                        button.label = "Cancelling..."
                        button.disabled = True
                        await interaction.response.edit_message(view=self)
                        self.stop()

                await ctx.edit(
                    embed=embed,
                    view=StopView(timeout=86400)
                )
                previous = await self.get_saved(webpage_url, chosen_format_id, snip or "*")
                if previous:
                    await ctx.edit(
                        content=previous,
                        embed=discord.Embed(
                            title=f"Downloaded {title}!",
                            description="Used previously downloaded attachment.",
                            colour=discord.Colour.green(),
                            timestamp=discord.utils.utcnow(),
                            url=previous,
                            fields=[discord.EmbedField(name="URL", value=previous, inline=False)],
                        ).set_image(url=previous),
                        # Nothing is being downloaded, so take the cancel button away again.
                        view=None,
                    )
                    return

                last_edit = time.time()

                try:
                    await asyncio.to_thread(functools.partial(downloader.download, [url]))
                except yt_dlp.DownloadError as e:
                    self.log.error(e, exc_info=True)
                    return await ctx.edit(
                        embed=discord.Embed(
                            title="Error",
                            description=f"Download failed:\n```\n{e}\n```",
                            colour=discord.Colour.red(),
                            url=webpage_url,
                        ),
                        delete_after=120,
                        view=None
                    )
                except RuntimeError:
                    # Raised by the progress hook once the cancel button has been pressed.
                    return await ctx.edit(
                        embed=discord.Embed(
                            title="Error",
                            description="Download was cancelled.",
                            colour=discord.Colour.red(),
                            url=webpage_url,
                        ),
                        delete_after=120,
                        view=None
                    )
                await ctx.edit(view=None)
                try:
                    if audio_only is False:
                        file: Path = next(temp_dir.glob("*." + extracted_info["ext"]))
                    else:
                        # can be .opus, .m4a, .mp3, .ogg, .oga
                        for _file in temp_dir.iterdir():
                            if _file.suffix in (".opus", ".m4a", ".mp3", ".ogg", ".oga", ".aac", ".wav"):
                                file: Path = _file
                                break
                        else:
                            # Share the "file not found" handling with the next() call above.
                            raise StopIteration
                except StopIteration:
                    ext = extracted_info["ext"]
                    self.log.warning(
                        "Failed to locate downloaded file. Was supposed to be looking for a file extension of "
                        "%r amongst files %r, however none were found.",
                        ext,
                        list(map(str, temp_dir.iterdir())),
                    )
                    return await ctx.edit(
                        embed=discord.Embed(
                            title="Error",
                            description="Failed to locate downloaded video file."
                            f" Was expecting a file with the extension {ext}.\n"
                            f"Files: {', '.join(list(map(str, temp_dir.iterdir())))}",
                            colour=discord.Colour.red(),
                            url=webpage_url,
                        )
                    )

                if snip:
                    # "start-end"; anything that does not split into exactly two parts is used as the start.
                    try:
                        trim_start, trim_end = snip.split("-")
                    except ValueError:
                        trim_start, trim_end = snip, None
                    trim_start = trim_start or "00:00:00"
                    trim_end = trim_end or extracted_info.get("duration_string", "00:30:00")
                    new_file = temp_dir / ("output" + file.suffix)
                    args = [
                        "-hwaccel",
                        "auto",
                        "-i",
                        str(file),
                        "-ss",
                        trim_start,
                        "-to",
                        trim_end,
                        "-preset",
                        "fast",
                        "-crf",
                        "24",
                        "-deadline",
                        "realtime",
                        "-cpu-used",
                        "5",
                        "-movflags",
                        "faststart",
                        "-b:a",
                        "96k",
                        "-y",
                        "-strict",
                        "2",
                        str(new_file),
                    ]
                    async with ctx.channel.typing():
                        await ctx.edit(
                            embed=discord.Embed(
                                title=f"Trimming from {trim_start} to {trim_end}.",
                                description="Please wait, this may take a couple of minutes.",
                                colour=discord.Colour.og_blurple(),
                                timestamp=discord.utils.utcnow(),
                            )
                        )
                        self.log.debug("Running command: 'ffmpeg %s'", " ".join(args))
                        process = await asyncio.create_subprocess_exec(
                            "ffmpeg", *args, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
                        )
                        stdout, stderr = await process.communicate()
                        self.log.debug("STDOUT:\n%r", stdout.decode())
                        self.log.debug("STDERR:\n%r", stderr.decode())
                        if process.returncode != 0:
                            return await ctx.edit(
                                embed=discord.Embed(
                                    title="Error",
                                    description=f"Trimming failed:\n```\n{stderr.decode()}\n```",
                                    colour=discord.Colour.red(),
                                    url=webpage_url,
                                )
                            )
                        file = new_file

                if audio_only and file.suffix != ".m4a":
                    self.log.info("Converting %r to m4a.", file)
                    file: Path = await asyncio.to_thread(self.convert_to_m4a, file)

                stat = file.stat()
                size_bytes = stat.st_size
                if size_bytes >= ((500 * 1024 * 1024) - 256):
                    return await ctx.edit(
                        embed=discord.Embed(
                            title="Error",
                            description=f"File is too large to upload ({round(size_bytes / 1024 / 1024)}MB).",
                            colour=discord.Colour.red(),
                            url=webpage_url,
                        )
                    )

                # Rough upload ETA, computed for an upload speed of 20 megabits per second.
                size_megabits = (size_bytes * 8) / 1024 / 1024
                eta_seconds = size_megabits / 20
                await ctx.edit(
                    embed=discord.Embed(
                        title="Uploading...",
                        description=f"ETA <t:{int(eta_seconds + discord.utils.utcnow().timestamp()) + 2}:R>",
                        colour=discord.Colour.og_blurple(),
                        timestamp=discord.utils.utcnow(),
                    )
                )
                embed = discord.Embed(
                    title=f"Downloaded {title}!",
                    description="Views: {:,} | Likes: {:,}".format(views or 0, likes or 0),
                    colour=discord.Colour.green(),
                    timestamp=discord.utils.utcnow(),
                    url=webpage_url,
                )
                try:
                    # Files of 20MiB or more, and HEVC/AV1 video, are hosted on 0x0.st instead of being attached.
                    if size_bytes >= (20 * 1024 * 1024) or vcodec.lower() in ["hevc", "h265", "av1", "av01"]:
                        with file.open("rb") as fb:
                            part = await self.upload_to_0x0(
                                file.name,
                                fb
                            )
                        embed.add_field(name="URL", value=f"https://0x0.st/{part}", inline=False)
                        await ctx.edit(
                            embed=embed
                        )
                        await ctx.respond("https://embeds.video/0x0/" + part)
                    else:
                        upload_file = await asyncio.to_thread(discord.File, file, filename=file.name)
                        msg = await ctx.edit(
                            file=upload_file,
                            embed=embed
                        )
                        # Remember the message so that the same request can re-use this attachment later.
                        await self.save_link(msg, webpage_url, chosen_format_id, snip=snip or "*")
                except (discord.HTTPException, ConnectionError, httpx.HTTPStatusError) as e:
                    self.log.error(e, exc_info=True)
                    return await ctx.edit(
                        embed=discord.Embed(
                            title="Error",
                            description=f"Upload failed:\n```\n{e}\n```",
                            colour=discord.Colour.red(),
                            url=webpage_url,
                        )
                    )


def setup(bot):
    """
    Creates a YTDLCog for the given bot and registers it through ``bot.add_cog``.
    :param bot: The bot to add the cog to. Presumably supplied by the bot's extension loader - verify.
    """
    cog = YTDLCog(bot)
    bot.add_cog(cog)