Compare commits

15 commits, all authored by nex. The status column shows the result of the
Build and Publish / build_and_publish (push) check for each commit; a dash
means no run was recorded.

SHA1        Message                                        Status                 Date
9a42ba815a  Act cache should be properly configured now    Failing after 2m39s    2024-06-17 00:53:14 +01:00
597ffd386c  Fix TypeError                                  Successful in 49s      2024-06-16 16:15:01 +01:00
290d5c9ccb  Fix ollama APi endpoint                        Successful in 51s      2024-06-16 16:10:50 +01:00
448a23affa  Allow importing legacy threads                 Successful in 58s      2024-06-16 15:53:43 +01:00
d203376850  Update the README                              Successful in 1m49s    2024-06-11 01:56:03 +01:00
76d3684449  Remove reference to throttle                   Successful in 45s      2024-06-11 01:44:34 +01:00
d4d550d7ba  Add a proper timeout to is_online              Cancelled              2024-06-11 01:44:09 +01:00
b6d747a63b  Don't follow up with an empty embed            Successful in 44s      2024-06-11 01:41:20 +01:00
e32d866ad4  add PS command                                 Successful in 48s      2024-06-11 01:37:20 +01:00
3c61504cb3  Fix /ollama pull                               -                      2024-06-11 01:21:34 +01:00
af11baeeaa  Clarify on-the-fly server names                Successful in 45s      2024-06-11 01:15:25 +01:00
954d01bca5  Add server info command                        Successful in 50s      2024-06-11 01:09:59 +01:00
c04e73dff9  Properly build master                          Successful in 44s      2024-06-11 01:03:51 +01:00
28908f217c  Enable ollama pull                             Failing after 1m53s    2024-06-11 00:58:17 +01:00
99001a60ba  Enable on-the-fly server construction          -                      2024-06-11 00:53:48 +01:00
5 changed files with 324 additions and 29 deletions

File 1 of 5: Build and Publish workflow (YAML)

@@ -16,10 +16,7 @@ jobs:
         id: meta
         uses: docker/metadata-action@v5
         with:
-          images: |
-            git.i-am.nexus/nex/sentient-jimmy
-          tags: |
-            type=sha
+          images: git.i-am.nexus/nex/sentient-jimmy

       - name: Log into forgejo CR
         uses: docker/login-action@v3
@@ -36,4 +33,4 @@ jobs:
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
           cache-from: type=gha
           cache-to: type=gha,mode=max

File 2 of 5: README

@@ -8,6 +8,14 @@ Another Ollama bot for discord, however designed for mesh self-hosting.
 [bot]
 token = "your-bot-token"
 debug_guilds = [0123456789]  # omit for global commands
+db_url = "sqlite://:memory:"
+# ^ The database URL. Overridden by $DATABASE_URL.
+# The default in a docker environment is IN MEMORY, i.e. `sqlite://:memory:`.
+# The default in a non-docker environment is sqlite://default.db, i.e. SQLite at ./default.db.
+# If $DATABASE_URL is set, it will override this setting.
+# You can use SQLite or PostgreSQL.
+# You can build jimmy with mysql/mssql/oracle support by changing the `asyncpg` extra
+# to `asyncmy`/`asyncodbc` in the tortoise-orm requirement in requirements.txt.

 [ollama]
 order = ["server1", "server2", "fallback"]
@@ -17,14 +25,23 @@ order = ["server1", "server2", "fallback"]
 base_url = "https://hosted.ollama.internal"  # default port is 443, because HTTPS
 gpu = true
 vram_gb = 8
+default_model = "llama3:latest"  # sets the default model for /ollama chat

 [ollama.server2]
 base_url = "http://192.168.1.2:11434"
 gpu = true
 vram_gb = 4  # <8GB will enable "low VRAM mode" in ollama
+default_model = "llama2:latest"

 [ollama.fallback]
 base_url = "http://192.168.1.250:11434"
 gpu = false
 vram_gb = 32  # in the case of CPU Ollama, "vram" is actually just regular RAM
+default_model = "orca-mini:3b"
 ```

+## Running
+
+See [the example docker-compose.yml](/docker-compose.yml) for how to run this bot with docker-compose.
+Alternatively, you can just run the docker image directly: `git.i-am.nexus/nex/sentient-jimmy:master`.
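The docker-compose.yml the README points at is not part of this diff. As a rough sketch of what such a service could look like, assuming the image published by the Build and Publish workflow; the service name, mount paths, and DATABASE_URL value are illustrative, not taken from the repository:

```yaml
services:
  jimmy:
    image: git.i-am.nexus/nex/sentient-jimmy:master
    restart: unless-stopped
    environment:
      # $DATABASE_URL overrides db_url from the config, per the README comment above
      DATABASE_URL: "sqlite:///data/jimmy.db"
    volumes:
      - ./config.toml:/app/config.toml:ro  # assumed config location inside the image
      - ./data:/data                       # persists the SQLite database across restarts
```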

File 3 of 5: the ollama chat cog (Python; path not shown in this view)

@@ -1,4 +1,5 @@
 import asyncio
+import datetime
 import io
 import logging
 import time
@@ -6,11 +7,12 @@ import typing
 import contextlib

 import discord
+import httpx
 from discord import Interaction
 from ollama import AsyncClient, ResponseError, Options
 from discord.ext import commands
 from jimmy.utils import create_ollama_message, find_suitable_server, decorate_server_name as decorate_name
-from jimmy.config import get_servers, get_server
+from jimmy.config import get_servers, get_server, get_config
 from jimmy.db import OllamaThread
 from humanize import naturalsize, naturaldelta
@@ -46,10 +48,13 @@ async def get_available_tags_autocomplete(ctx: discord.AutocompleteContext):
     chosen_server = get_server(ctx.options.get("server") or get_servers()[0].name)
     async with ollama_client(str(chosen_server.base_url), timeout=2) as client:
         tags = (await client.list())["models"]
-    return [tag["model"] for tag in tags if ctx.value.casefold() in tag["model"].casefold()]
+    v = [tag["model"] for tag in tags if ctx.value.casefold() in tag["model"].casefold()]
+    return [ctx.value, *v][:25]


-_ServerOptionChoices = [discord.OptionChoice(server.name, server.name) for server in get_servers()]
+_ServerOptionAutocomplete = discord.utils.basic_autocomplete(
+    [x.name for x in get_servers()]
+)


 class Chat(commands.Cog):
@@ -60,7 +65,13 @@
             self.server_locks[server.name] = asyncio.Lock()
         self.log = logging.getLogger(__name__)

-    @commands.slash_command()
+    ollama_group = discord.SlashCommandGroup(
+        name="ollama",
+        description="Commands related to ollama.",
+        guild_only=True
+    )
+
+    @ollama_group.command()
     async def status(self, ctx: discord.ApplicationContext):
         """Checks the status on all servers."""
         await ctx.defer()
@@ -71,10 +82,10 @@
         )
         fields = {}
         for server in get_servers():
-            if server.throttle and self.server_locks[server.name].locked():
+            if self.server_locks[server.name].locked():
                 embed.add_field(
                     name=decorate_name(server),
-                    value=f"\N{closed lock with key} In use.",
+                    value="\N{closed lock with key} In use.",
                     inline=False
                 )
                 fields[server] = len(embed.fields) - 1
@@ -82,7 +93,7 @@
             else:
                 embed.add_field(
                     name=decorate_name(server),
-                    value=f"\N{hourglass with flowing sand} Waiting...",
+                    value="\N{hourglass with flowing sand} Waiting...",
                     inline=False
                 )
                 fields[server] = len(embed.fields) - 1
@@ -90,7 +101,7 @@
         await ctx.respond(embed=embed)
         tasks = {}
         for server in get_servers():
-            if server.throttle and self.server_locks[server.name].locked():
+            if self.server_locks[server.name].locked():
                 continue
             tasks[server] = asyncio.create_task(server.is_online())
@@ -100,19 +111,52 @@
                 embed.set_field_at(
                     fields[server],
                     name=decorate_name(server),
-                    value=f"\N{white heavy check mark} Online.",
+                    value="\N{white heavy check mark} Online.",
                     inline=False
                 )
             else:
                 embed.set_field_at(
                     fields[server],
                     name=decorate_name(server),
-                    value=f"\N{cross mark} Offline.",
+                    value="\N{cross mark} Offline.",
                     inline=False
                 )
         await ctx.edit(embed=embed)

-    @commands.slash_command(name="ollama")
+    @ollama_group.command(name="server-info")
+    async def get_server_info(
+        self,
+        ctx: discord.ApplicationContext,
+        server: typing.Annotated[
+            str,
+            discord.Option(
+                discord.SlashCommandOptionType.string,
+                description="The server to use.",
+                autocomplete=_ServerOptionAutocomplete,
+                default=get_servers()[0].name
+            )
+        ]
+    ):
+        """Gets information on a given server"""
+        await ctx.defer()
+        server = get_server(server)
+        is_online = await server.is_online()
+        y = "\N{white heavy check mark}"
+        x = "\N{cross mark}"
+        t = {True: y, False: x}
+        rt = "VRAM" if server.gpu else "RAM"
+        lines = [
+            f"Name: {server.name!r}",
+            f"Base URL: {server.base_url!r}",
+            f"GPU Enabled: {t[server.gpu]}",
+            f"{rt}: {server.vram_gb:,} GB",
+            f"Default Model: {server.default_model!r}",
+            f"Is Online: {t[is_online]}"
+        ]
+        p = "```md\n" + "\n".join(lines) + "```"
+        return await ctx.respond(p)
+
+    @ollama_group.command(name="chat")
     async def start_ollama_chat(
         self,
         ctx: discord.ApplicationContext,
@@ -130,7 +174,7 @@
             discord.Option(
                 discord.SlashCommandOptionType.string,
                 description="The server to use.",
-                choices=_ServerOptionChoices,
+                autocomplete=_ServerOptionAutocomplete,
                 default=get_servers()[0].name
             )
         ],
@@ -140,7 +184,7 @@
                 discord.SlashCommandOptionType.string,
                 description="The model to use.",
                 autocomplete=get_available_tags_autocomplete,
-                default="llama3:latest"
+                default="default"
             )
         ],
         image: typing.Annotated[
@@ -173,7 +217,9 @@
         """Have a chat with ollama"""
         await ctx.defer()
         server = get_server(server)
-        if not await server.is_online():
+        if not server:
+            return await ctx.respond("\N{cross mark} Unknown Server.")
+        elif not await server.is_online():
             await ctx.respond(
                 content=f"{server} is offline. Finding a suitable server...",
             )
@@ -183,14 +229,17 @@
                 return await ctx.edit(content=str(err), delete_after=30)
             await ctx.delete(delay=5)

         async with self.server_locks[server.name]:
+            if model == "default":
+                model = server.default_model
             async with ollama_client(str(server.base_url)) as client:
                 client: AsyncClient
                 self.log.info("Checking if %r has the model %r", server, model)
                 tags = (await client.list())["models"]
+                # Download code. It's recommended to collapse this in the editor.
                 if model not in [x["model"] for x in tags]:
                     embed = discord.Embed(
                         title=f"Downloading {model} on {server}.",
-                        description=f"Initiating download...",
+                        description="Initiating download...",
                         color=discord.Color.blurple()
                     )
                     view = StopDownloadView(ctx)
@@ -265,6 +314,7 @@
                     await ctx.edit(embed=embed, delete_after=30, view=None)

         messages = []
+        thread = None
         if thread_id:
             thread = await OllamaThread.get_or_none(thread_id=thread_id)
             if thread:
@@ -272,8 +322,29 @@
                 messages.append(
                     await create_ollama_message(msg["content"], role=msg["role"])
                 )
+            elif len(thread_id) == 6:
+                # Is a legacy thread
+                _cfg = get_config()["truth_api"]
+                async with httpx.AsyncClient(
+                    base_url=_cfg["url"],
+                    auth=(_cfg["username"], _cfg["password"])
+                ) as http_client:
+                    response = await http_client.get(f"/ollama/thread/threads:{thread_id}")
+                    if response.status_code == 200:
+                        thread = response.json()
+                        messages = thread["messages"]
+                        thread = OllamaThread(
+                            messages=[{"role": m["role"], "content": m["content"]} for m in messages],
+                        )
+                        await thread.save()
+                    else:
+                        return await ctx.respond(
+                            content="Failed to fetch legacy ollama thread from jimmy v2: HTTP %d (`%r`)" % (
+                                response.status_code, response.text
+                            ),
+                        )
             else:
-                await ctx.respond(content="No thread with that ID exists.", delete_after=30)
+                return await ctx.respond(content="No thread with that ID exists.", delete_after=30)

         if system_prompt:
             messages.append(await create_ollama_message(system_prompt, role="system"))
         messages.append(await create_ollama_message(prompt, images=[await image.read()] if image else None))
@@ -325,18 +396,187 @@
             embed.add_field(
                 name="Full chat",
                 value="The chat was too long to fit in this message. "
-                      f"You can download the `full-chat.txt` file to see the full message."
+                      "You can download the `full-chat.txt` file to see the full message."
             )
         else:
             file = discord.utils.MISSING

-        thread = OllamaThread(
-            messages=[{"role": m["role"], "content": m["content"]} for m in messages],
-        )
-        await thread.save()
+        if not thread:
+            thread = OllamaThread(
+                messages=[{"role": m["role"], "content": m["content"]} for m in messages],
+            )
+            await thread.save()
         embed.set_footer(text=f"Chat ID: {thread.thread_id}")
         await msg.edit(embed=embed, view=None, file=file)
+
+    @ollama_group.command(name="pull")
+    async def pull_ollama_model(
+        self,
+        ctx: discord.ApplicationContext,
+        server: typing.Annotated[
+            str,
+            discord.Option(
+                discord.SlashCommandOptionType.string,
+                description="The server to use.",
+                autocomplete=_ServerOptionAutocomplete,
+                default=get_servers()[0].name
+            )
+        ],
+        model: typing.Annotated[
+            str,
+            discord.Option(
+                discord.SlashCommandOptionType.string,
+                description="The model to use.",
+                autocomplete=get_available_tags_autocomplete,
+                default="llama3:latest"
+            )
+        ],
+    ):
+        """Downloads a tag on the target server"""
+        await ctx.defer()
+        server = get_server(server)
+        if not server:
+            return await ctx.respond("\N{cross mark} Unknown server.")
+        elif not await server.is_online():
+            return await ctx.respond(f"\N{cross mark} Server {server.name!r} is not responding")
+        embed = discord.Embed(
+            title=f"Downloading {model} on {server}.",
+            description="Initiating download...",
+            color=discord.Color.blurple()
+        )
+        view = StopDownloadView(ctx)
+        await ctx.respond(
+            embed=embed,
+            view=view
+        )
+        last_edit = 0
+        async with ctx.typing():
+            try:
+                last_completed = 0
+                last_completed_ts = time.time()
+                async with ollama_client(str(server.base_url)) as client:
+                    async for line in await client.pull(model, stream=True):
+                        if view.event.is_set():
+                            embed.add_field(name="Error!", value="Download cancelled.")
+                            embed.colour = discord.Colour.red()
+                            await ctx.edit(embed=embed)
+                            return
+                        self.log.debug("Response from %r: %r", server, line)
+                        if line["status"] in {
+                            "pulling manifest",
+                            "verifying sha256 digest",
+                            "writing manifest",
+                            "removing any unused layers",
+                            "success"
+                        }:
+                            embed.description = line["status"].capitalize()
+                        else:
+                            total = line["total"]
+                            completed = line.get("completed", 0)
+                            percent = round(completed / total * 100, 1)
+                            pb_fill = "█" * int(percent / 10)
+                            pb_empty = "░" * (10 - int(percent / 10))
+                            bytes_per_second = completed - last_completed
+                            bytes_per_second /= (time.time() - last_completed_ts)
+                            last_completed = completed
+                            last_completed_ts = time.time()
+                            mbps = round((bytes_per_second * 8) / 1024 / 1024)
+                            eta = (total - completed) / max(1, bytes_per_second)
+                            progress_bar = f"[{pb_fill}{pb_empty}]"
+                            ns_total = naturalsize(total, binary=True)
+                            ns_completed = naturalsize(completed, binary=True)
+                            embed.description = (
+                                f"{line['status'].capitalize()} {percent}% {progress_bar} "
+                                f"({ns_completed}/{ns_total} @ {mbps} Mb/s) "
+                                f"[ETA: {naturaldelta(eta)}]"
+                            )
+                        if time.time() - last_edit >= 2.5:
+                            await ctx.edit(embed=embed)
+                            last_edit = time.time()
+            except ResponseError as err:
+                if err.error.endswith("file does not exist"):
+                    await ctx.edit(
+                        embed=None,
+                        content="The model %r does not exist." % model,
+                        delete_after=60,
+                        view=None
+                    )
+                else:
+                    embed.add_field(
+                        name="Error!",
+                        value=err.error
+                    )
+                    embed.colour = discord.Colour.red()
+                    await ctx.edit(embed=embed, view=None)
+                return
+            else:
+                embed.colour = discord.Colour.green()
+                embed.description = f"Downloaded {model} on {server}."
+                await ctx.edit(embed=embed, delete_after=30, view=None)
+
+    @ollama_group.command(name="ps")
+    async def ollama_proc_list(
+        self,
+        ctx: discord.ApplicationContext,
+        server: typing.Annotated[
+            str,
+            discord.Option(
+                discord.SlashCommandOptionType.string,
+                description="The server to use.",
+                autocomplete=_ServerOptionAutocomplete,
+                default=get_servers()[0].name
+            )
+        ]
+    ):
+        """Checks the loaded models on the target server"""
+        await ctx.defer()
+        server = get_server(server)
+        if not server:
+            return await ctx.respond("\N{cross mark} Unknown server.")
+        elif not await server.is_online():
+            return await ctx.respond(f"\N{cross mark} Server {server.name!r} is not responding")
+        async with ollama_client(str(server.base_url)) as client:
+            response = (await client.ps())["models"]
+            if not response:
+                embed = discord.Embed(
+                    title=f"No models loaded on {server}.",
+                    color=discord.Color.blurple()
+                )
+                return await ctx.respond(embed=embed)
+            embed = discord.Embed(
+                title=f"Models loaded on {server}",
+                color=discord.Color.blurple()
+            )
+            for model in response[:25]:
+                size = naturalsize(model["size"], binary=True)
+                size_vram = naturalsize(model["size_vram"], binary=True)
+                size_ram = naturalsize(model["size"] - model["size_vram"], binary=True)
+                percent_in_vram = round(model["size_vram"] / model["size"] * 100)
+                percent_in_ram = 100 - percent_in_vram
+                expires = datetime.datetime.fromisoformat(model["expires_at"])
+                lines = [
+                    f"* Size: {size}",
+                    f"* Unloaded: {discord.utils.format_dt(expires, style='R')}",
+                ]
+                if percent_in_ram > 0:
+                    lines.extend(
+                        [
+                            f"* VRAM/RAM: {percent_in_vram}%/{percent_in_ram}%",
+                            f"* VRAM Size: {size_vram}",
+                            f"* RAM Size: {size_ram}"
+                        ]
+                    )
+                else:
+                    lines.append(f"* VRAM Size: {size_vram} (100%)")
+                embed.add_field(
+                    name=model["model"],
+                    value="\n".join(lines),
+                    inline=False
+                )
+        await ctx.respond(embed=embed)


 def setup(bot):
     bot.add_cog(Chat(bot))
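The pull handler above derives its throughput and ETA figures from the deltas between successive status lines. Restated as a standalone sketch for clarity (the function name and signature are illustrative, not part of the bot's API):

```python
import time

def progress_stats(completed: int, total: int, last_completed: int, last_ts: float):
    """Percent, megabits/s, and ETA for one pull progress update, as in the cog above."""
    elapsed = max(time.time() - last_ts, 1e-6)          # guard against a zero interval
    bytes_per_second = (completed - last_completed) / elapsed
    percent = round(completed / total * 100, 1)
    mbps = round((bytes_per_second * 8) / 1024 / 1024)  # bytes/s -> megabits/s
    eta_seconds = (total - completed) / max(1.0, bytes_per_second)
    return percent, mbps, eta_seconds
```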

File 4 of 5: jimmy/config.py

@@ -1,6 +1,7 @@
 import os
 import tomllib
 import logging
+import urllib.parse
 from typing import Callable

 import httpx
@@ -10,11 +11,11 @@ log = logging.getLogger(__name__)

 class ServerConfig(BaseModel):
-    name: str = Field(min_length=1, max_length=32)
+    name: str = Field(min_length=1, max_length=4096)
     base_url: AnyHttpUrl
     gpu: bool = False
     vram_gb: int = 4
-    throttle: bool = False
+    default_model: str = "llama3:latest"

     def __repr__(self):
         return "<ServerConfig name={0.name} base_url={0.base_url} gpu={0.gpu!s} vram_gb={0.vram_gb}>".format(self)
@@ -26,7 +27,7 @@ class ServerConfig(BaseModel):
         """
         Checks that the current server is online and responding to requests.
         """
-        async with httpx.AsyncClient(base_url=str(self.base_url)) as client:
+        async with httpx.AsyncClient(base_url=str(self.base_url), timeout=httpx.Timeout(2.25)) as client:
             try:
                 response = await client.get("/api/tags")
                 return response.status_code == 200
@@ -57,6 +58,40 @@ def get_server(name_or_base_url: str) -> ServerConfig | None:
     for server in servers:
         if server.name == name_or_base_url or server.base_url == name_or_base_url:
             return server
+    try:
+        parsed = urllib.parse.urlparse(name_or_base_url)
+    except ValueError:
+        pass
+    else:
+        if parsed.netloc and parsed.scheme in ["http", "https"]:
+            defaults = {
+                "name": ":temporary:-:%s:" % parsed.hostname,
+                "base_url": "{0.scheme}://{0.netloc}".format(parsed),
+                "gpu": False,
+                "vram_gb": 2,
+                "default_model": "orca-mini:3b"
+            }
+            if parsed.path and parsed.path.endswith(("/api", "/api/")):
+                defaults["base_url"] += parsed.path
+            parsed_qs = urllib.parse.parse_qs(parsed.query)
+            for key, values in parsed_qs.items():
+                if not values:
+                    continue
+                if key == "gpu":
+                    values = [
+                        values[0][0].lower() in ("t", "1", "y")
+                    ]
+                elif key == "vram_gb":
+                    try:
+                        values = [
+                            int(values[0])
+                        ]
+                    except ValueError:
+                        values = []
+                if values:
+                    defaults[key] = values[0]
+            return ServerConfig(**defaults)
     return None
@@ -67,6 +102,10 @@ def get_config():
     _loaded.setdefault("servers", {})
     _loaded["servers"].setdefault("order", [])
     _loaded.setdefault("bot", {})
+    _loaded.setdefault("truth_api", {})
+    _loaded["truth_api"].setdefault("url", "https://bots.nexy7574.co.uk/jimmy/v2/api")
+    _loaded["truth_api"].setdefault("username", "invalid")
+    _loaded["truth_api"].setdefault("password", "invalid")
     if database_url := os.getenv("DATABASE_URL"):
         _loaded["bot"]["db_url"] = database_url
     return _loaded
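With the fallback above, get_server now accepts any http(s) URL as an on-the-fly server, with ?gpu= and ?vram_gb= query parameters overriding the temporary defaults. A quick sketch of the resulting behaviour, assuming the parsing logic shown in this hunk (the URL itself is illustrative):

```python
from jimmy.config import get_server

# Not in the configured server list, but a valid http URL, so get_server
# builds a temporary ServerConfig instead of returning None.
server = get_server("http://192.168.1.5:11434?gpu=true&vram_gb=8")
assert server.name == ":temporary:-:192.168.1.5:"
assert server.gpu is True                      # "?gpu=true": leading "t" counts as truthy
assert server.vram_gb == 8                     # parsed from "?vram_gb=8"
assert server.default_model == "orca-mini:3b"  # hard-coded default for temporary servers
```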

File 5 of 5: tox.ini (new file, 2 additions)

@@ -0,0 +1,2 @@
+[flake8]
+max-line-length = 120
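flake8 reads `[flake8]` sections from tox.ini, so running `flake8` at the repository root applies the 120-character line limit with no extra flags; this lines up with the f-string cleanups in the chat cog above.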