diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..58322dc
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,4 @@
+config.toml
+**/config.toml
+*.db
+*.db-*
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index c5683a0..651f249 100644
--- a/.gitignore
+++ b/.gitignore
@@ -282,5 +282,6 @@ pyrightconfig.json
 
 # End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,pycharm+all
 .venv/
-default.db
+*.db
+*.db-*
 config.toml
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..c5e724f
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,8 @@
+FROM python:3.12-alpine
+
+WORKDIR /jimmy
+RUN apk add --update --no-cache py3-pip py3-setuptools py3-wheel
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+COPY ./jimmy/ /jimmy/
+CMD ["python3", "main.py"]
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..6f10518
--- /dev/null
+++ b/README.md
@@ -0,0 +1,30 @@
+# Sentient Jimmy
+
+Another Ollama bot for Discord, designed for mesh self-hosting.
+
+## Example config.toml
+
+```toml
+[bot]
+token = "your-bot-token"
+debug_guilds = [123456789] # omit for global commands
+
+[ollama]
+order = ["server1", "server2", "fallback"]
+# ^ order of preference for Ollama servers. If server1 is offline, server2 will be tried, and so on
+
+[ollama.server1]
+base_url = "https://hosted.ollama.internal" # default port is 443, because HTTPS
+gpu = true
+vram_gb = 8
+
+[ollama.server2]
+base_url = "http://192.168.1.2:11434"
+gpu = true
+vram_gb = 4 # <8GB will enable "low VRAM mode" in ollama
+
+[ollama.fallback]
+base_url = "http://192.168.1.250:11434"
+gpu = false
+vram_gb = 32 # in the case of CPU Ollama, "vram" is actually just regular RAM.
+```
diff --git a/jimmy/cogs/chat.py b/jimmy/cogs/chat.py
index fd250b8..de660e0 100644
--- a/jimmy/cogs/chat.py
+++ b/jimmy/cogs/chat.py
@@ -4,16 +4,15 @@ import logging
 import time
 import typing
 import contextlib
-from fnmatch import fnmatch
 
 import discord
 from discord import Interaction
 from ollama import AsyncClient, ResponseError, Options
 from discord.ext import commands
-from jimmy.utils import async_ratio, create_ollama_message
-from jimmy.config import get_servers, ServerConfig, get_server
-from jimmy.db import OllamaThread
-from humanize import naturalsize
+from jimmy.utils import create_ollama_message, find_suitable_server, decorate_server_name as decorate_name
+from jimmy.config import get_servers, get_server
+from jimmy.db import OllamaThread
+from humanize import naturalsize, naturaldelta
 
 
 @contextlib.asynccontextmanager
@@ -66,11 +65,6 @@ class Chat(commands.Cog):
         """Checks the status on all servers."""
         await ctx.defer()
-        def decorate_name(_s: ServerConfig):
-            if _s.gpu:
-                return f"{_s.name} (\u26A1)"
-            return _s.name
-
         embed = discord.Embed(
             title="Ollama Statuses:",
             color=discord.Color.blurple()
         )
@@ -164,18 +158,31 @@
                     description="The thread ID to continue.",
                     default=None
                 )
+            ],
+            temperature: typing.Annotated[
+                float,
+                discord.Option(
+                    discord.SlashCommandOptionType.number,
+                    description="The temperature to use.",
+                    default=1.5,
+                    min_value=0.0,
+                    max_value=2.0
+                )
             ]
     ):
         """Have a chat with ollama"""
         await ctx.defer()
         server = get_server(server)
+        if not await server.is_online():
+            await ctx.respond(
+                content=f"{server} is offline. Finding a suitable server...",
+            )
+            try:
+                server = await find_suitable_server()
+            except ValueError as err:
+                return await ctx.edit(content=str(err), delete_after=30)
+            await ctx.delete(delay=5)
         async with self.server_locks[server.name]:
-            if not await server.is_online():
-                await ctx.respond(
-                    content=f"{server} is offline.",
-                    delete_after=60
-                )
-                return
             async with ollama_client(str(server.base_url)) as client:
                 client: AsyncClient
                 self.log.info("Checking if %r has the model %r", server, model)
@@ -203,7 +210,7 @@
                         embed.colour = discord.Colour.red()
                         await ctx.edit(embed=embed)
                         return
-                    self.log.info("Response from %r: %r", server, line)
+                    self.log.debug("Response from %r: %r", server, line)
                     if line["status"] in {
                         "pulling manifest",
                         "verifying sha256 digest",
@@ -223,12 +230,14 @@
                             last_completed = completed
                             last_completed_ts = time.time()
                         mbps = round((bytes_per_second * 8) / 1024 / 1024)
+                        eta = (total - completed) / max(1, bytes_per_second)
                         progress_bar = f"[{pb_fill}{pb_empty}]"
                         ns_total = naturalsize(total, binary=True)
                         ns_completed = naturalsize(completed, binary=True)
                         embed.description = (
                             f"{line['status'].capitalize()} {percent}% {progress_bar} "
-                            f"({ns_completed}/{ns_total} @ {mbps} Mb/s)"
+                            f"({ns_completed}/{ns_total} @ {mbps} Mb/s) "
+                            f"[ETA: {naturaldelta(eta)}]"
                         )
 
                         if time.time() - last_edit >= 2.5:
@@ -268,7 +277,11 @@
                 if system_prompt:
                     messages.append(await create_ollama_message(system_prompt, role="system"))
                 messages.append(await create_ollama_message(prompt, images=[await image.read()] if image else None))
-                embed = discord.Embed(title=f"{model}:", description="")
+                embed = discord.Embed(description="")
+                embed.set_author(
+                    name=f"{model} @ {decorate_name(server)!r}" if server.gpu else model,
+                    icon_url="https://ollama.com/public/icon-64x64.png"
+                )
                 view = StopDownloadView(ctx)
                 msg = await ctx.respond(
                     embed=embed,
@@ -283,10 +296,11 @@
                     options=Options(
                         num_ctx=4096,
                         low_vram=server.vram_gb < 8,
-                        temperature=1.5
+                        temperature=temperature
                     )
                 ):
-                    self.log.info("Response from %r: %r", server, response)
+                    response: dict
+                    self.log.debug("Response from %r: %r", server, response)
                     buffer.write(response["message"]["content"])
 
                     if len(buffer.getvalue()) > 4096:
diff --git a/jimmy/main.py b/jimmy/main.py
index f26f1ca..2db1617 100644
--- a/jimmy/main.py
+++ b/jimmy/main.py
@@ -1,12 +1,11 @@
 import os
-import sys
 import logging
 import discord
+import sys
 from discord.ext import commands
 from tortoise import Tortoise
 
-sys.path.extend("..") # noqa: E402
-from .config import get_config
-
+from config import get_config
+sys.path.extend([".", ".."])
 log = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
@@ -24,7 +23,7 @@
             strip_after_prefix=True,
             debug_guilds=get_config()["bot"].get("debug_guilds"),
         )
-        self.load_extension("jimmy.cogs.chat")
+        self.load_extension("cogs.chat")
         self.load_extension("jishaku")
 
     async def start(self, token: str, *, reconnect: bool = True) -> None:
diff --git a/jimmy/utils.py b/jimmy/utils.py
index 782fe35..042dfaf 100644
--- a/jimmy/utils.py
+++ b/jimmy/utils.py
@@ -4,13 +4,24 @@ from functools import partial
 
 from fuzzywuzzy.fuzz import ratio
 from ollama import Message
 
+if typing.TYPE_CHECKING:
+    from .config import ServerConfig
+
 __all__ = (
     'async_ratio',
     'create_ollama_message',
+    'find_suitable_server',
+    'decorate_server_name'
 )
 
 
+def decorate_server_name(_s: "ServerConfig") -> str:
+    if _s.gpu:
+        return f"{_s.name} (\u26A1)"
+    return _s.name
+
+
 async def async_ratio(a: str, b: str) -> int:
     """
     Wraps fuzzywuzzy ratio in an async function
@@ -45,3 +56,23 @@
             images=images
         )
     )
+
+
+async def find_suitable_server(cpu_fallback: bool = True) -> "ServerConfig":
+    """
+    Finds a suitable server to use for Ollama.
+
+    :param cpu_fallback: bool - whether to fall back to CPU servers if GPU servers are unavailable.
+    :return: ServerConfig - the server to use
+    """
+    from .config import get_servers
+    servers = get_servers()
+    if not servers:
+        raise ValueError("No servers configured.")
+    for server in servers:
+        if cpu_fallback is False and server.gpu is False:
+            continue
+        if not await server.is_online():
+            continue
+        return server
+    raise ValueError("No servers available.")
diff --git a/requirements.txt b/requirements.txt
index 2302399..eebfff5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,6 @@
 py-cord~=2.5
 ollama~=0.2
 tortoise-orm[asyncpg]~=0.21
-uvicorn[standard]~=0.30
-fastapi~=0.111
 jishaku~=2.5
 fuzzywuzzy~=0.18
 humanize~=4.9
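Build/run sketch for trying this branch (not part of the diff): the image tag `sentient-jimmy` is illustrative, and mounting `config.toml` into the image's `WORKDIR /jimmy` assumes `get_config()` reads it from the working directory, which this diff does not show. `.dockerignore` keeps `config.toml` and the `*.db` files out of the build context, so the config has to be supplied at runtime.

```sh
# Build the image from the new Dockerfile (tag name is an example)
docker build -t sentient-jimmy .

# Run with the host's config.toml mounted read-only into the working directory
docker run --rm -v "$(pwd)/config.toml:/jimmy/config.toml:ro" sentient-jimmy
```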