Compare commits

...

15 commits

All commits are authored by nex.

| SHA | Message | Build and Publish (push) | Date |
| --- | --- | --- | --- |
| 9a42ba815a | Act cache should be properly configured now | Failing after 2m39s | 2024-06-17 00:53:14 +01:00 |
| 597ffd386c | Fix TypeError | Successful in 49s | 2024-06-16 16:15:01 +01:00 |
| 290d5c9ccb | Fix ollama API endpoint | Successful in 51s | 2024-06-16 16:10:50 +01:00 |
| 448a23affa | Allow importing legacy threads | Successful in 58s | 2024-06-16 15:53:43 +01:00 |
| d203376850 | Update the README | Successful in 1m49s | 2024-06-11 01:56:03 +01:00 |
| 76d3684449 | Remove reference to throttle | Successful in 45s | 2024-06-11 01:44:34 +01:00 |
| d4d550d7ba | Add a proper timeout to is_online | Cancelled | 2024-06-11 01:44:09 +01:00 |
| b6d747a63b | Don't follow up with an empty embed | Successful in 44s | 2024-06-11 01:41:20 +01:00 |
| e32d866ad4 | add PS command | Successful in 48s | 2024-06-11 01:37:20 +01:00 |
| 3c61504cb3 | Fix /ollama pull | n/a | 2024-06-11 01:21:34 +01:00 |
| af11baeeaa | Clarify on-the-fly server names | Successful in 45s | 2024-06-11 01:15:25 +01:00 |
| 954d01bca5 | Add server info command | Successful in 50s | 2024-06-11 01:09:59 +01:00 |
| c04e73dff9 | Properly build master | Successful in 44s | 2024-06-11 01:03:51 +01:00 |
| 28908f217c | Enable ollama pull | Failing after 1m53s | 2024-06-11 00:58:17 +01:00 |
| 99001a60ba | Enable on-the-fly server construction | n/a | 2024-06-11 00:53:48 +01:00 |
5 changed files with 324 additions and 29 deletions

Changed: Build and Publish workflow (CI)

@@ -16,10 +16,7 @@ jobs:
id: meta
uses: docker/metadata-action@v5
with:
images: |
git.i-am.nexus/nex/sentient-jimmy
tags: |
type=sha
images: git.i-am.nexus/nex/sentient-jimmy
- name: Log into forgejo CR
uses: docker/login-action@v3
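For readability, the metadata step after this change collapses to a single `images` value. A sketch of the resulting step (the step name is assumed; the `id`, action versions, and image value come from the hunk):

```yaml
- name: Docker metadata   # step name assumed
  id: meta
  uses: docker/metadata-action@v5
  with:
    images: git.i-am.nexus/nex/sentient-jimmy
- name: Log into forgejo CR
  uses: docker/login-action@v3
```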

Changed: README

@@ -8,6 +8,14 @@ Another Ollama bot for discord, however designed for mesh self-hosting.
[bot]
token = "your-bot-token"
debug_guilds = [0123456789] # omit for global commands
db_url = "sqlite://:memory:"
# ^ The database URL. SQLite and PostgreSQL are supported.
# If $DATABASE_URL is set, it overrides this setting.
# The default in a docker environment is IN MEMORY, i.e. `sqlite://:memory:`.
# The default in a non-docker environment is `sqlite://default.db`, i.e. sqlite at ./default.db.
# You can build jimmy with MySQL or MSSQL/Oracle support by changing the `asyncpg` extra
# to `asyncmy` or `asyncodbc` on the tortoise-orm requirement in requirements.txt.
[ollama]
order = ["server1", "server2", "fallback"]
@@ -17,14 +25,23 @@ order = ["server1", "server2", "fallback"]
base_url = "https://hosted.ollama.internal" # default port is 443, because HTTPS
gpu = true
vram_gb = 8
default_model="llama3:latest" # sets the default model for /ollama chat
[ollama.server2]
base_url = "http://192.168.1.2:11434"
gpu = true
vram_gb = 4 # <8GB will enable "low VRAM mode" in ollama
default_model = "llama2:latest"
[ollama.fallback]
base_url = "http://192.168.1.250:11434"
gpu = false
vram_gb = 32 # in the case of CPU Ollama, "vram" is actually just regular RAM.
default_model = "orca-mini:3b"
```
## Running
See [the example docker-compose.yml](/docker-compose.yml) for an example of how to run this bot with docker-compose.
Alternatively, you can just run the docker image: `git.i-am.nexus/nex/sentient-jimmy:master`.
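The compose file itself is not part of this compare view; as a minimal sketch only (the service name, volume paths, and `DATABASE_URL` value below are assumptions, not the project's actual compose file):

```yaml
services:
  jimmy:
    image: git.i-am.nexus/nex/sentient-jimmy:master
    restart: unless-stopped
    environment:
      # $DATABASE_URL overrides db_url from the config; without it, the
      # docker default is the in-memory SQLite noted above.
      DATABASE_URL: "sqlite:///data/jimmy.db"
    volumes:
      - ./config.toml:/app/config.toml:ro   # config path inside the image is assumed
      - ./data:/data
```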

Changed: Chat cog (Python)

@@ -1,4 +1,5 @@
import asyncio
import datetime
import io
import logging
import time
@@ -6,11 +7,12 @@ import typing
import contextlib
import discord
import httpx
from discord import Interaction
from ollama import AsyncClient, ResponseError, Options
from discord.ext import commands
from jimmy.utils import create_ollama_message, find_suitable_server, decorate_server_name as decorate_name
from jimmy.config import get_servers, get_server
from jimmy.config import get_servers, get_server, get_config
from jimmy.db import OllamaThread
from humanize import naturalsize, naturaldelta
@@ -46,10 +48,13 @@ async def get_available_tags_autocomplete(ctx: discord.AutocompleteContext):
chosen_server = get_server(ctx.options.get("server") or get_servers()[0].name)
async with ollama_client(str(chosen_server.base_url), timeout=2) as client:
tags = (await client.list())["models"]
return [tag["model"] for tag in tags if ctx.value.casefold() in tag["model"].casefold()]
v = [tag["model"] for tag in tags if ctx.value.casefold() in tag["model"].casefold()]
return [ctx.value, *v][:25]
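# NB: Discord caps autocomplete responses at 25 choices, hence the [:25] slice;
# the raw input is kept as the first suggestion.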
_ServerOptionChoices = [discord.OptionChoice(server.name, server.name) for server in get_servers()]
_ServerOptionAutocomplete = discord.utils.basic_autocomplete(
[x.name for x in get_servers()]
)
class Chat(commands.Cog):
@@ -60,7 +65,13 @@ class Chat(commands.Cog):
self.server_locks[server.name] = asyncio.Lock()
self.log = logging.getLogger(__name__)
@commands.slash_command()
ollama_group = discord.SlashCommandGroup(
name="ollama",
description="Commands related to ollama.",
guild_only=True
)
@ollama_group.command()
async def status(self, ctx: discord.ApplicationContext):
"""Checks the status on all servers."""
await ctx.defer()
@@ -71,10 +82,10 @@
)
fields = {}
for server in get_servers():
if server.throttle and self.server_locks[server.name].locked():
if self.server_locks[server.name].locked():
embed.add_field(
name=decorate_name(server),
value=f"\N{closed lock with key} In use.",
value="\N{closed lock with key} In use.",
inline=False
)
fields[server] = len(embed.fields) - 1
@@ -82,7 +93,7 @@
else:
embed.add_field(
name=decorate_name(server),
value=f"\N{hourglass with flowing sand} Waiting...",
value="\N{hourglass with flowing sand} Waiting...",
inline=False
)
fields[server] = len(embed.fields) - 1
@@ -90,7 +101,7 @@
await ctx.respond(embed=embed)
tasks = {}
for server in get_servers():
if server.throttle and self.server_locks[server.name].locked():
if self.server_locks[server.name].locked():
continue
tasks[server] = asyncio.create_task(server.is_online())
@@ -100,19 +111,52 @@
embed.set_field_at(
fields[server],
name=decorate_name(server),
value=f"\N{white heavy check mark} Online.",
value="\N{white heavy check mark} Online.",
inline=False
)
else:
embed.set_field_at(
fields[server],
name=decorate_name(server),
value=f"\N{cross mark} Offline.",
value="\N{cross mark} Offline.",
inline=False
)
await ctx.edit(embed=embed)
@commands.slash_command(name="ollama")
@ollama_group.command(name="server-info")
async def get_server_info(
self,
ctx: discord.ApplicationContext,
server: typing.Annotated[
str,
discord.Option(
discord.SlashCommandOptionType.string,
description="The server to use.",
autocomplete=_ServerOptionAutocomplete,
default=get_servers()[0].name
)
]
):
"""Gets information on a given server"""
await ctx.defer()
server = get_server(server)
is_online = await server.is_online()
y = "\N{white heavy check mark}"
x = "\N{cross mark}"
t = {True: y, False: x}
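# map True/False to the check mark / cross mark emoji for the lines below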
rt = "VRAM" if server.gpu else "RAM"
lines = [
f"Name: {server.name!r}",
f"Base URL: {server.base_url!r}",
f"GPU Enabled: {t[server.gpu]}",
f"{rt}: {server.vram_gb:,} GB",
f"Default Model: {server.default_model!r}",
f"Is Online: {t[is_online]}"
]
p = "```md\n" + "\n".join(lines) + "```"
return await ctx.respond(p)
@ollama_group.command(name="chat")
async def start_ollama_chat(
self,
ctx: discord.ApplicationContext,
@@ -130,7 +174,7 @@ class Chat(commands.Cog):
discord.Option(
discord.SlashCommandOptionType.string,
description="The server to use.",
choices=_ServerOptionChoices,
autocomplete=_ServerOptionAutocomplete,
default=get_servers()[0].name
)
],
@@ -140,7 +184,7 @@ class Chat(commands.Cog):
discord.SlashCommandOptionType.string,
description="The model to use.",
autocomplete=get_available_tags_autocomplete,
default="llama3:latest"
default="default"
)
],
image: typing.Annotated[
@@ -173,7 +217,9 @@ class Chat(commands.Cog):
"""Have a chat with ollama"""
await ctx.defer()
server = get_server(server)
if not await server.is_online():
if not server:
return await ctx.respond("\N{cross mark} Unknown Server.")
elif not await server.is_online():
await ctx.respond(
content=f"{server} is offline. Finding a suitable server...",
)
@@ -183,14 +229,17 @@ class Chat(commands.Cog):
return await ctx.edit(content=str(err), delete_after=30)
await ctx.delete(delay=5)
async with self.server_locks[server.name]:
if model == "default":
model = server.default_model
async with ollama_client(str(server.base_url)) as client:
client: AsyncClient
self.log.info("Checking if %r has the model %r", server, model)
tags = (await client.list())["models"]
# Download code. It's recommended to collapse this in the editor.
if model not in [x["model"] for x in tags]:
embed = discord.Embed(
title=f"Downloading {model} on {server}.",
description=f"Initiating download...",
description="Initiating download...",
color=discord.Color.blurple()
)
view = StopDownloadView(ctx)
@@ -265,6 +314,7 @@ class Chat(commands.Cog):
await ctx.edit(embed=embed, delete_after=30, view=None)
messages = []
thread = None
if thread_id:
thread = await OllamaThread.get_or_none(thread_id=thread_id)
if thread:
@@ -272,8 +322,29 @@ class Chat(commands.Cog):
messages.append(
await create_ollama_message(msg["content"], role=msg["role"])
)
elif len(thread_id) == 6:
# Is a legacy thread
_cfg = get_config()["truth_api"]
async with httpx.AsyncClient(
base_url=_cfg["url"],
auth=(_cfg["username"], _cfg["password"])
) as http_client:
response = await http_client.get(f"/ollama/thread/threads:{thread_id}")
if response.status_code == 200:
thread = response.json()
messages = thread["messages"]
thread = OllamaThread(
messages=[{"role": m["role"], "content": m["content"]} for m in messages],
)
await thread.save()
else:
return await ctx.respond(
content="Failed to fetch legacy ollama thread from jimmy v2: HTTP %d (`%r`)" % (
response.status_code, response.text
),
)
else:
await ctx.respond(content="No thread with that ID exists.", delete_after=30)
return await ctx.respond(content="No thread with that ID exists.", delete_after=30)
if system_prompt:
messages.append(await create_ollama_message(system_prompt, role="system"))
messages.append(await create_ollama_message(prompt, images=[await image.read()] if image else None))
@@ -325,18 +396,187 @@ class Chat(commands.Cog):
embed.add_field(
name="Full chat",
value="The chat was too long to fit in this message. "
f"You can download the `full-chat.txt` file to see the full message."
"You can download the `full-chat.txt` file to see the full message."
)
else:
file = discord.utils.MISSING
thread = OllamaThread(
messages=[{"role": m["role"], "content": m["content"]} for m in messages],
)
await thread.save()
if not thread:
thread = OllamaThread(
messages=[{"role": m["role"], "content": m["content"]} for m in messages],
)
await thread.save()
embed.set_footer(text=f"Chat ID: {thread.thread_id}")
await msg.edit(embed=embed, view=None, file=file)
@ollama_group.command(name="pull")
async def pull_ollama_model(
self,
ctx: discord.ApplicationContext,
server: typing.Annotated[
str,
discord.Option(
discord.SlashCommandOptionType.string,
description="The server to use.",
autocomplete=_ServerOptionAutocomplete,
default=get_servers()[0].name
)
],
model: typing.Annotated[
str,
discord.Option(
discord.SlashCommandOptionType.string,
description="The model to use.",
autocomplete=get_available_tags_autocomplete,
default="llama3:latest"
)
],
):
"""Downloads a tag on the target server"""
await ctx.defer()
server = get_server(server)
if not server:
return await ctx.respond("\N{cross mark} Unknown server.")
elif not await server.is_online():
return await ctx.respond(f"\N{cross mark} Server {server.name!r} is not responding")
embed = discord.Embed(
title=f"Downloading {model} on {server}.",
description="Initiating download...",
color=discord.Color.blurple()
)
view = StopDownloadView(ctx)
await ctx.respond(
embed=embed,
view=view
)
last_edit = 0
async with ctx.typing():
try:
last_completed = 0
last_completed_ts = time.time()
async with ollama_client(str(server.base_url)) as client:
async for line in await client.pull(model, stream=True):
if view.event.is_set():
embed.add_field(name="Error!", value="Download cancelled.")
embed.colour = discord.Colour.red()
await ctx.edit(embed=embed)
return
self.log.debug("Response from %r: %r", server, line)
if line["status"] in {
"pulling manifest",
"verifying sha256 digest",
"writing manifest",
"removing any unused layers",
"success"
}:
embed.description = line["status"].capitalize()
else:
total = line["total"]
completed = line.get("completed", 0)
percent = round(completed / total * 100, 1)
pb_fill = "█" * int(percent / 10)  # filled-segment glyph assumed
pb_empty = "░" * (10 - int(percent / 10))  # empty-segment glyph assumed
bytes_per_second = completed - last_completed
bytes_per_second /= (time.time() - last_completed_ts)
last_completed = completed
last_completed_ts = time.time()
mbps = round((bytes_per_second * 8) / 1024 / 1024)
eta = (total - completed) / max(1, bytes_per_second)
progress_bar = f"[{pb_fill}{pb_empty}]"
ns_total = naturalsize(total, binary=True)
ns_completed = naturalsize(completed, binary=True)
embed.description = (
f"{line['status'].capitalize()} {percent}% {progress_bar} "
f"({ns_completed}/{ns_total} @ {mbps} Mb/s) "
f"[ETA: {naturaldelta(eta)}]"
)
if time.time() - last_edit >= 2.5:
await ctx.edit(embed=embed)
last_edit = time.time()
except ResponseError as err:
if err.error.endswith("file does not exist"):
await ctx.edit(
embed=None,
content="The model %r does not exist." % model,
delete_after=60,
view=None
)
else:
embed.add_field(
name="Error!",
value=err.error
)
embed.colour = discord.Colour.red()
await ctx.edit(embed=embed, view=None)
return
else:
embed.colour = discord.Colour.green()
embed.description = f"Downloaded {model} on {server}."
await ctx.edit(embed=embed, delete_after=30, view=None)
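As an aside, the progress arithmetic above (percent, throughput, ETA) can be exercised in isolation. A minimal sketch mirroring the cog's formulas; the function name and sample numbers are illustrative:

```python
from humanize import naturalsize, naturaldelta

def render_progress(status: str, total: int, completed: int, bytes_per_second: float) -> str:
    """Build the same progress line the embed description uses."""
    percent = round(completed / total * 100, 1)
    filled = int(percent / 10)
    bar = "[" + "█" * filled + "░" * (10 - filled) + "]"  # 10-segment bar, glyphs assumed
    mbps = round((bytes_per_second * 8) / 1024 / 1024)  # bytes/s -> megabits/s, 1024-based as in the cog
    eta = (total - completed) / max(1, bytes_per_second)  # max() guards against division by zero
    return (
        f"{status.capitalize()} {percent}% {bar} "
        f"({naturalsize(completed, binary=True)}/{naturalsize(total, binary=True)} @ {mbps} Mb/s) "
        f"[ETA: {naturaldelta(eta)}]"
    )

# e.g. 1.2 GiB of a 4 GiB pull moving at ~30 MiB/s:
print(render_progress("pulling abc123", 4 * 1024**3, int(1.2 * 1024**3), 30 * 1024**2))
```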
@ollama_group.command(name="ps")
async def ollama_proc_list(
self,
ctx: discord.ApplicationContext,
server: typing.Annotated[
str,
discord.Option(
discord.SlashCommandOptionType.string,
description="The server to use.",
autocomplete=_ServerOptionAutocomplete,
default=get_servers()[0].name
)
]
):
"""Checks the loaded models on the target server"""
await ctx.defer()
server = get_server(server)
if not server:
return await ctx.respond("\N{cross mark} Unknown server.")
elif not await server.is_online():
return await ctx.respond(f"\N{cross mark} Server {server.name!r} is not responding")
async with ollama_client(str(server.base_url)) as client:
response = (await client.ps())["models"]
if not response:
embed = discord.Embed(
title=f"No models loaded on {server}.",
color=discord.Color.blurple()
)
return await ctx.respond(embed=embed)
embed = discord.Embed(
title=f"Models loaded on {server}",
color=discord.Color.blurple()
)
for model in response[:25]:
size = naturalsize(model["size"], binary=True)
size_vram = naturalsize(model["size_vram"], binary=True)
size_ram = naturalsize(model["size"] - model["size_vram"], binary=True)
percent_in_vram = round(model["size_vram"] / model["size"] * 100)
percent_in_ram = 100 - percent_in_vram
expires = datetime.datetime.fromisoformat(model["expires_at"])
lines = [
f"* Size: {size}",
f"* Unloaded: {discord.utils.format_dt(expires, style='R')}",
]
if percent_in_ram > 0:
lines.extend(
[
f"* VRAM/RAM: {percent_in_vram}%/{percent_in_ram}%",
f"* VRAM Size: {size_vram}",
f"* RAM Size: {size_ram}"
]
)
else:
lines.append(f"* VRAM Size: {size_vram} (100%)")
embed.add_field(
name=model["model"],
value="\n".join(lines),
inline=False
)
await ctx.respond(embed=embed)
def setup(bot):
bot.add_cog(Chat(bot))

Changed: config module (jimmy.config)

@@ -1,6 +1,7 @@
import os
import tomllib
import logging
import urllib.parse
from typing import Callable
import httpx
@@ -10,11 +11,11 @@ log = logging.getLogger(__name__)
class ServerConfig(BaseModel):
name: str = Field(min_length=1, max_length=32)
name: str = Field(min_length=1, max_length=4096)
base_url: AnyHttpUrl
gpu: bool = False
vram_gb: int = 4
throttle: bool = False
default_model: str = "llama3:latest"
def __repr__(self):
return "<ServerConfig name={0.name} base_url={0.base_url} gpu={0.gpu!s} vram_gb={0.vram_gb}>".format(self)
@@ -26,7 +27,7 @@ class ServerConfig(BaseModel):
"""
Checks that the current server is online and responding to requests.
"""
async with httpx.AsyncClient(base_url=str(self.base_url)) as client:
async with httpx.AsyncClient(base_url=str(self.base_url), timeout=httpx.Timeout(2.25)) as client:
try:
response = await client.get("/api/tags")
return response.status_code == 200
@@ -57,6 +58,40 @@ def get_server(name_or_base_url: str) -> ServerConfig | None:
for server in servers:
if server.name == name_or_base_url or server.base_url == name_or_base_url:
return server
try:
parsed = urllib.parse.urlparse(name_or_base_url)
except ValueError:
pass
else:
if parsed.netloc and parsed.scheme in ["http", "https"]:
defaults = {
"name": ":temporary:-:%s:" % parsed.hostname,
"base_url": "{0.scheme}://{0.netloc}".format(parsed),
"gpu": False,
"vram_gb": 2,
"default_model": "orca-mini:3b"
}
if parsed.path and parsed.path.endswith(("/api", "/api/")):
defaults["base_url"] += parsed.path
parsed_qs = urllib.parse.parse_qs(parsed.query)
for key, values in parsed_qs.items():
if not values:
continue
if key == "gpu":
values = [
values[0][0].lower() in ("t", "1", "y")
]
elif key == "vram_gb":
try:
values = [
int(values[0])
]
except ValueError:
values = []
if values:
defaults[key] = values[0]
return ServerConfig(**defaults)
return None
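Given this parsing, a user can point a command at an ad-hoc server by passing a bare URL, optionally tuning it with query parameters. A sketch of the resulting behaviour (the address is illustrative):

```python
from jimmy.config import get_server

# An http(s) URL that doesn't match a configured server yields a temporary ServerConfig.
server = get_server("http://192.168.1.99:11434?gpu=y&vram_gb=8")
assert server is not None
print(server.name)           # ':temporary:-:192.168.1.99:'
print(server.gpu)            # True ('y' is one of the accepted truthy prefixes: t/1/y)
print(server.vram_gb)        # 8
print(server.default_model)  # 'orca-mini:3b' unless ?default_model=... is supplied
```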
@@ -67,6 +102,10 @@ def get_config():
_loaded.setdefault("servers", {})
_loaded["servers"].setdefault("order", [])
_loaded.setdefault("bot", {})
_loaded.setdefault("truth_api", {})
_loaded["truth_api"].setdefault("url", "https://bots.nexy7574.co.uk/jimmy/v2/api")
_loaded["truth_api"].setdefault("username", "invalid")
_loaded["truth_api"].setdefault("password", "invalid")
if database_url := os.getenv("DATABASE_URL"):
_loaded["bot"]["db_url"] = database_url
return _loaded
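These defaults imply a `[truth_api]` table in the config file for pointing the legacy-thread import at a Jimmy v2 instance; shown below with the built-in defaults (real credentials would replace the placeholders):

```toml
[truth_api]
url = "https://bots.nexy7574.co.uk/jimmy/v2/api"
username = "invalid"  # placeholder default
password = "invalid"  # placeholder default
```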

Added: tox.ini (new file, 2 lines)

@@ -0,0 +1,2 @@
[flake8]
max-line-length = 120