Compare commits

...

15 commits

All commits are authored by nex.

| SHA | Message | Build and Publish (push) | Date |
| --- | --- | --- | --- |
| 9a42ba815a | Act cache should be properly configured now | Failing after 2m39s | 2024-06-17 00:53:14 +01:00 |
| 597ffd386c | Fix TypeError | Successful in 49s | 2024-06-16 16:15:01 +01:00 |
| 290d5c9ccb | Fix ollama API endpoint | Successful in 51s | 2024-06-16 16:10:50 +01:00 |
| 448a23affa | Allow importing legacy threads | Successful in 58s | 2024-06-16 15:53:43 +01:00 |
| d203376850 | Update the README | Successful in 1m49s | 2024-06-11 01:56:03 +01:00 |
| 76d3684449 | Remove reference to throttle | Successful in 45s | 2024-06-11 01:44:34 +01:00 |
| d4d550d7ba | Add a proper timeout to is_online | Cancelled | 2024-06-11 01:44:09 +01:00 |
| b6d747a63b | Don't follow up with an empty embed | Successful in 44s | 2024-06-11 01:41:20 +01:00 |
| e32d866ad4 | add PS command | Successful in 48s | 2024-06-11 01:37:20 +01:00 |
| 3c61504cb3 | Fix /ollama pull | n/a | 2024-06-11 01:21:34 +01:00 |
| af11baeeaa | Clarify on-the-fly server names | Successful in 45s | 2024-06-11 01:15:25 +01:00 |
| 954d01bca5 | Add server info command | Successful in 50s | 2024-06-11 01:09:59 +01:00 |
| c04e73dff9 | Properly build master | Successful in 44s | 2024-06-11 01:03:51 +01:00 |
| 28908f217c | Enable ollama pull | Failing after 1m53s | 2024-06-11 00:58:17 +01:00 |
| 99001a60ba | Enable on-the-fly server construction | n/a | 2024-06-11 00:53:48 +01:00 |
5 changed files with 324 additions and 29 deletions

Changed: Build and Publish workflow (CI)

@@ -16,10 +16,7 @@ jobs:
id: meta
uses: docker/metadata-action@v5
with:
images: |
git.i-am.nexus/nex/sentient-jimmy
tags: |
type=sha
images: git.i-am.nexus/nex/sentient-jimmy
- name: Log into forgejo CR
uses: docker/login-action@v3
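For readability, the metadata step after this change collapses to a single `images` value. A sketch of the resulting step (the step name is assumed; the `id`, action versions, and image value come from the hunk):

```yaml
- name: Docker metadata   # step name assumed
  id: meta
  uses: docker/metadata-action@v5
  with:
    images: git.i-am.nexus/nex/sentient-jimmy
- name: Log into forgejo CR
  uses: docker/login-action@v3
```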

Changed: README

@@ -8,6 +8,14 @@ Another Ollama bot for discord, however designed for mesh self-hosting.
[bot]
token = "your-bot-token"
debug_guilds = [0123456789] # omit for global commands
db_url = "sqlite://:memory:"
# ^ The database URL. SQLite and PostgreSQL are supported.
# If $DATABASE_URL is set, it overrides this setting.
# The default in a docker environment is IN MEMORY, i.e. `sqlite://:memory:`.
# The default in a non-docker environment is `sqlite://default.db`, i.e. sqlite at ./default.db.
# You can build jimmy with MySQL or MSSQL/Oracle support by changing the `asyncpg` extra
# to `asyncmy` or `asyncodbc` on the tortoise-orm requirement in requirements.txt.
[ollama]
order = ["server1", "server2", "fallback"]
@@ -17,14 +25,23 @@ order = ["server1", "server2", "fallback"]
base_url = "https://hosted.ollama.internal" # default port is 443, because HTTPS
gpu = true
vram_gb = 8
default_model="llama3:latest" # sets the default model for /ollama chat
[ollama.server2]
base_url = "http://192.168.1.2:11434"
gpu = true
vram_gb = 4 # <8GB will enable "low VRAM mode" in ollama
default_model = "llama2:latest"
[ollama.fallback]
base_url = "http://192.168.1.250:11434"
gpu = false
vram_gb = 32 # in the case of CPU Ollama, "vram" is actually just regular RAM.
default_model = "orca-mini:3b"
```
## Running
See [the example docker-compose.yml](/docker-compose.yml) for an example of how to run this bot with docker-compose.
Alternatively, you can just run the docker image: `git.i-am.nexus/nex/sentient-jimmy:master`.
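The compose file itself is not part of this compare view; as a minimal sketch only (the service name, volume paths, and `DATABASE_URL` value below are assumptions, not the project's actual compose file):

```yaml
services:
  jimmy:
    image: git.i-am.nexus/nex/sentient-jimmy:master
    restart: unless-stopped
    environment:
      # $DATABASE_URL overrides db_url from the config; without it, the
      # docker default is the in-memory SQLite noted above.
      DATABASE_URL: "sqlite:///data/jimmy.db"
    volumes:
      - ./config.toml:/app/config.toml:ro   # config path inside the image is assumed
      - ./data:/data
```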

Changed: Chat cog (Python)

@@ -1,4 +1,5 @@
import asyncio
import datetime
import io
import logging
import time
@@ -6,11 +7,12 @@ import typing
import contextlib
import discord
import httpx
from discord import Interaction
from ollama import AsyncClient, ResponseError, Options
from discord.ext import commands
from jimmy.utils import create_ollama_message, find_suitable_server, decorate_server_name as decorate_name
from jimmy.config import get_servers, get_server
from jimmy.config import get_servers, get_server, get_config
from jimmy.db import OllamaThread
from humanize import naturalsize, naturaldelta
@@ -46,10 +48,13 @@ async def get_available_tags_autocomplete(ctx: discord.AutocompleteContext):
chosen_server = get_server(ctx.options.get("server") or get_servers()[0].name)
async with ollama_client(str(chosen_server.base_url), timeout=2) as client:
tags = (await client.list())["models"]
return [tag["model"] for tag in tags if ctx.value.casefold() in tag["model"].casefold()]
v = [tag["model"] for tag in tags if ctx.value.casefold() in tag["model"].casefold()]
return [ctx.value, *v][:25]
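# NB: Discord caps autocomplete responses at 25 choices, hence the [:25] slice;
# the raw input is kept as the first suggestion.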
_ServerOptionChoices = [discord.OptionChoice(server.name, server.name) for server in get_servers()]
_ServerOptionAutocomplete = discord.utils.basic_autocomplete(
[x.name for x in get_servers()]
)
class Chat(commands.Cog):
@@ -60,7 +65,13 @@ class Chat(commands.Cog):
self.server_locks[server.name] = asyncio.Lock()
self.log = logging.getLogger(__name__)
@commands.slash_command()
ollama_group = discord.SlashCommandGroup(
name="ollama",
description="Commands related to ollama.",
guild_only=True
)
@ollama_group.command()
async def status(self, ctx: discord.ApplicationContext):
"""Checks the status on all servers."""
await ctx.defer()
@@ -71,10 +82,10 @@
)
fields = {}
for server in get_servers():
if server.throttle and self.server_locks[server.name].locked():
if self.server_locks[server.name].locked():
embed.add_field(
name=decorate_name(server),
value=f"\N{closed lock with key} In use.",
value="\N{closed lock with key} In use.",
inline=False
)
fields[server] = len(embed.fields) - 1
@@ -82,7 +93,7 @@
else:
embed.add_field(
name=decorate_name(server),
value=f"\N{hourglass with flowing sand} Waiting...",
value="\N{hourglass with flowing sand} Waiting...",
inline=False
)
fields[server] = len(embed.fields) - 1
@@ -90,7 +101,7 @@
await ctx.respond(embed=embed)
tasks = {}
for server in get_servers():
if server.throttle and self.server_locks[server.name].locked():
if self.server_locks[server.name].locked():
continue
tasks[server] = asyncio.create_task(server.is_online())
@@ -100,19 +111,52 @@
embed.set_field_at(
fields[server],
name=decorate_name(server),
value=f"\N{white heavy check mark} Online.",
value="\N{white heavy check mark} Online.",
inline=False
)
else:
embed.set_field_at(
fields[server],
name=decorate_name(server),
value=f"\N{cross mark} Offline.",
value="\N{cross mark} Offline.",
inline=False
)
await ctx.edit(embed=embed)
@commands.slash_command(name="ollama")
@ollama_group.command(name="server-info")
async def get_server_info(
self,
ctx: discord.ApplicationContext,
server: typing.Annotated[
str,
discord.Option(
discord.SlashCommandOptionType.string,
description="The server to use.",
autocomplete=_ServerOptionAutocomplete,
default=get_servers()[0].name
)
]
):
"""Gets information on a given server"""
await ctx.defer()
server = get_server(server)
is_online = await server.is_online()
y = "\N{white heavy check mark}"
x = "\N{cross mark}"
t = {True: y, False: x}
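# map True/False to the check mark / cross mark emoji for the lines below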
rt = "VRAM" if server.gpu else "RAM"
lines = [
f"Name: {server.name!r}",
f"Base URL: {server.base_url!r}",
f"GPU Enabled: {t[server.gpu]}",
f"{rt}: {server.vram_gb:,} GB",
f"Default Model: {server.default_model!r}",
f"Is Online: {t[is_online]}"
]
p = "```md\n" + "\n".join(lines) + "```"
return await ctx.respond(p)
@ollama_group.command(name="chat")
async def start_ollama_chat(
self,
ctx: discord.ApplicationContext,
@@ -130,7 +174,7 @@ class Chat(commands.Cog):
discord.Option(
discord.SlashCommandOptionType.string,
description="The server to use.",
choices=_ServerOptionChoices,
autocomplete=_ServerOptionAutocomplete,
default=get_servers()[0].name
)
],
@@ -140,7 +184,7 @@ class Chat(commands.Cog):
discord.SlashCommandOptionType.string,
description="The model to use.",
autocomplete=get_available_tags_autocomplete,
default="llama3:latest"
default="default"
)
],
image: typing.Annotated[
@@ -173,7 +217,9 @@ class Chat(commands.Cog):
"""Have a chat with ollama"""
await ctx.defer()
server = get_server(server)
if not await server.is_online():
if not server:
return await ctx.respond("\N{cross mark} Unknown Server.")
elif not await server.is_online():
await ctx.respond(
content=f"{server} is offline. Finding a suitable server...",
)
@@ -183,14 +229,17 @@ class Chat(commands.Cog):
return await ctx.edit(content=str(err), delete_after=30)
await ctx.delete(delay=5)
async with self.server_locks[server.name]:
if model == "default":
model = server.default_model
async with ollama_client(str(server.base_url)) as client:
client: AsyncClient
self.log.info("Checking if %r has the model %r", server, model)
tags = (await client.list())["models"]
# Download code. It's recommended to collapse this in the editor.
if model not in [x["model"] for x in tags]:
embed = discord.Embed(
title=f"Downloading {model} on {server}.",
description=f"Initiating download...",
description="Initiating download...",
color=discord.Color.blurple()
)
view = StopDownloadView(ctx)
@@ -265,6 +314,7 @@ class Chat(commands.Cog):
await ctx.edit(embed=embed, delete_after=30, view=None)
messages = []
thread = None
if thread_id:
thread = await OllamaThread.get_or_none(thread_id=thread_id)
if thread:
@@ -272,8 +322,29 @@ class Chat(commands.Cog):
messages.append(
await create_ollama_message(msg["content"], role=msg["role"])
)
elif len(thread_id) == 6:
# Is a legacy thread
_cfg = get_config()["truth_api"]
async with httpx.AsyncClient(
base_url=_cfg["url"],
auth=(_cfg["username"], _cfg["password"])
) as http_client:
response = await http_client.get(f"/ollama/thread/threads:{thread_id}")
if response.status_code == 200:
thread = response.json()
messages = thread["messages"]
thread = OllamaThread(
messages=[{"role": m["role"], "content": m["content"]} for m in messages],
)
await thread.save()
else:
return await ctx.respond(
content="Failed to fetch legacy ollama thread from jimmy v2: HTTP %d (`%r`)" % (
response.status_code, response.text
),
)
else:
await ctx.respond(content="No thread with that ID exists.", delete_after=30)
return await ctx.respond(content="No thread with that ID exists.", delete_after=30)
if system_prompt:
messages.append(await create_ollama_message(system_prompt, role="system"))
messages.append(await create_ollama_message(prompt, images=[await image.read()] if image else None))
@@ -325,18 +396,187 @@ class Chat(commands.Cog):
embed.add_field(
name="Full chat",
value="The chat was too long to fit in this message. "
f"You can download the `full-chat.txt` file to see the full message."
"You can download the `full-chat.txt` file to see the full message."
)
else:
file = discord.utils.MISSING
thread = OllamaThread(
messages=[{"role": m["role"], "content": m["content"]} for m in messages],
)
await thread.save()
if not thread:
thread = OllamaThread(
messages=[{"role": m["role"], "content": m["content"]} for m in messages],
)
await thread.save()
embed.set_footer(text=f"Chat ID: {thread.thread_id}")
await msg.edit(embed=embed, view=None, file=file)
@ollama_group.command(name="pull")
async def pull_ollama_model(
self,
ctx: discord.ApplicationContext,
server: typing.Annotated[
str,
discord.Option(
discord.SlashCommandOptionType.string,
description="The server to use.",
autocomplete=_ServerOptionAutocomplete,
default=get_servers()[0].name
)
],
model: typing.Annotated[
str,
discord.Option(
discord.SlashCommandOptionType.string,
description="The model to use.",
autocomplete=get_available_tags_autocomplete,
default="llama3:latest"
)
],
):
"""Downloads a tag on the target server"""
await ctx.defer()
server = get_server(server)
if not server:
return await ctx.respond("\N{cross mark} Unknown server.")
elif not await server.is_online():
return await ctx.respond(f"\N{cross mark} Server {server.name!r} is not responding")
embed = discord.Embed(
title=f"Downloading {model} on {server}.",
description="Initiating download...",
color=discord.Color.blurple()
)
view = StopDownloadView(ctx)
await ctx.respond(
embed=embed,
view=view
)
last_edit = 0
async with ctx.typing():
try:
last_completed = 0
last_completed_ts = time.time()
async with ollama_client(str(server.base_url)) as client:
async for line in await client.pull(model, stream=True):
if view.event.is_set():
embed.add_field(name="Error!", value="Download cancelled.")
embed.colour = discord.Colour.red()
await ctx.edit(embed=embed)
return
self.log.debug("Response from %r: %r", server, line)
if line["status"] in {
"pulling manifest",
"verifying sha256 digest",
"writing manifest",
"removing any unused layers",
"success"
}:
embed.description = line["status"].capitalize()
else:
total = line["total"]
completed = line.get("completed", 0)
percent = round(completed / total * 100, 1)
pb_fill = "█" * int(percent / 10)  # filled-segment glyph assumed
pb_empty = "░" * (10 - int(percent / 10))  # empty-segment glyph assumed
bytes_per_second = completed - last_completed
bytes_per_second /= (time.time() - last_completed_ts)
last_completed = completed
last_completed_ts = time.time()
mbps = round((bytes_per_second * 8) / 1024 / 1024)
eta = (total - completed) / max(1, bytes_per_second)
progress_bar = f"[{pb_fill}{pb_empty}]"
ns_total = naturalsize(total, binary=True)
ns_completed = naturalsize(completed, binary=True)
embed.description = (
f"{line['status'].capitalize()} {percent}% {progress_bar} "
f"({ns_completed}/{ns_total} @ {mbps} Mb/s) "
f"[ETA: {naturaldelta(eta)}]"
)
if time.time() - last_edit >= 2.5:
await ctx.edit(embed=embed)
last_edit = time.time()
except ResponseError as err:
if err.error.endswith("file does not exist"):
await ctx.edit(
embed=None,
content="The model %r does not exist." % model,
delete_after=60,
view=None
)
else:
embed.add_field(
name="Error!",
value=err.error
)
embed.colour = discord.Colour.red()
await ctx.edit(embed=embed, view=None)
return
else:
embed.colour = discord.Colour.green()
embed.description = f"Downloaded {model} on {server}."
await ctx.edit(embed=embed, delete_after=30, view=None)
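As an aside, the progress arithmetic above (percent, throughput, ETA) can be exercised in isolation. A minimal sketch mirroring the cog's formulas; the function name and sample numbers are illustrative:

```python
from humanize import naturalsize, naturaldelta

def render_progress(status: str, total: int, completed: int, bytes_per_second: float) -> str:
    """Build the same progress line the embed description uses."""
    percent = round(completed / total * 100, 1)
    filled = int(percent / 10)
    bar = "[" + "█" * filled + "░" * (10 - filled) + "]"  # 10-segment bar, glyphs assumed
    mbps = round((bytes_per_second * 8) / 1024 / 1024)  # bytes/s -> megabits/s, 1024-based as in the cog
    eta = (total - completed) / max(1, bytes_per_second)  # max() guards against division by zero
    return (
        f"{status.capitalize()} {percent}% {bar} "
        f"({naturalsize(completed, binary=True)}/{naturalsize(total, binary=True)} @ {mbps} Mb/s) "
        f"[ETA: {naturaldelta(eta)}]"
    )

# e.g. 1.2 GiB of a 4 GiB pull moving at ~30 MiB/s:
print(render_progress("pulling abc123", 4 * 1024**3, int(1.2 * 1024**3), 30 * 1024**2))
```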
@ollama_group.command(name="ps")
async def ollama_proc_list(
self,
ctx: discord.ApplicationContext,
server: typing.Annotated[
str,
discord.Option(
discord.SlashCommandOptionType.string,
description="The server to use.",
autocomplete=_ServerOptionAutocomplete,
default=get_servers()[0].name
)
]
):
"""Checks the loaded models on the target server"""
await ctx.defer()
server = get_server(server)
if not server:
return await ctx.respond("\N{cross mark} Unknown server.")
elif not await server.is_online():
return await ctx.respond(f"\N{cross mark} Server {server.name!r} is not responding")
async with ollama_client(str(server.base_url)) as client:
response = (await client.ps())["models"]
if not response:
embed = discord.Embed(
title=f"No models loaded on {server}.",
color=discord.Color.blurple()
)
return await ctx.respond(embed=embed)
embed = discord.Embed(
title=f"Models loaded on {server}",
color=discord.Color.blurple()
)
for model in response[:25]:
size = naturalsize(model["size"], binary=True)
size_vram = naturalsize(model["size_vram"], binary=True)
size_ram = naturalsize(model["size"] - model["size_vram"], binary=True)
percent_in_vram = round(model["size_vram"] / model["size"] * 100)
percent_in_ram = 100 - percent_in_vram
expires = datetime.datetime.fromisoformat(model["expires_at"])
lines = [
f"* Size: {size}",
f"* Unloaded: {discord.utils.format_dt(expires, style='R')}",
]
if percent_in_ram > 0:
lines.extend(
[
f"* VRAM/RAM: {percent_in_vram}%/{percent_in_ram}%",
f"* VRAM Size: {size_vram}",
f"* RAM Size: {size_ram}"
]
)
else:
lines.append(f"* VRAM Size: {size_vram} (100%)")
embed.add_field(
name=model["model"],
value="\n".join(lines),
inline=False
)
await ctx.respond(embed=embed)
def setup(bot):
bot.add_cog(Chat(bot))

Changed: config module (jimmy.config)

@@ -1,6 +1,7 @@
import os
import tomllib
import logging
import urllib.parse
from typing import Callable
import httpx
@@ -10,11 +11,11 @@ log = logging.getLogger(__name__)
class ServerConfig(BaseModel):
name: str = Field(min_length=1, max_length=32)
name: str = Field(min_length=1, max_length=4096)
base_url: AnyHttpUrl
gpu: bool = False
vram_gb: int = 4
throttle: bool = False
default_model: str = "llama3:latest"
def __repr__(self):
return "<ServerConfig name={0.name} base_url={0.base_url} gpu={0.gpu!s} vram_gb={0.vram_gb}>".format(self)
@@ -26,7 +27,7 @@ class ServerConfig(BaseModel):
"""
Checks that the current server is online and responding to requests.
"""
async with httpx.AsyncClient(base_url=str(self.base_url)) as client:
async with httpx.AsyncClient(base_url=str(self.base_url), timeout=httpx.Timeout(2.25)) as client:
try:
response = await client.get("/api/tags")
return response.status_code == 200
@@ -57,6 +58,40 @@ def get_server(name_or_base_url: str) -> ServerConfig | None:
for server in servers:
if server.name == name_or_base_url or server.base_url == name_or_base_url:
return server
try:
parsed = urllib.parse.urlparse(name_or_base_url)
except ValueError:
pass
else:
if parsed.netloc and parsed.scheme in ["http", "https"]:
defaults = {
"name": ":temporary:-:%s:" % parsed.hostname,
"base_url": "{0.scheme}://{0.netloc}".format(parsed),
"gpu": False,
"vram_gb": 2,
"default_model": "orca-mini:3b"
}
if parsed.path and parsed.path.endswith(("/api", "/api/")):
defaults["base_url"] += parsed.path
parsed_qs = urllib.parse.parse_qs(parsed.query)
for key, values in parsed_qs.items():
if not values:
continue
if key == "gpu":
values = [
values[0][0].lower() in ("t", "1", "y")
]
elif key == "vram_gb":
try:
values = [
int(values[0])
]
except ValueError:
values = []
if values:
defaults[key] = values[0]
return ServerConfig(**defaults)
return None
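Given this parsing, a user can point a command at an ad-hoc server by passing a bare URL, optionally tuning it with query parameters. A sketch of the resulting behaviour (the address is illustrative):

```python
from jimmy.config import get_server

# An http(s) URL that doesn't match a configured server yields a temporary ServerConfig.
server = get_server("http://192.168.1.99:11434?gpu=y&vram_gb=8")
assert server is not None
print(server.name)           # ':temporary:-:192.168.1.99:'
print(server.gpu)            # True ('y' is one of the accepted truthy prefixes: t/1/y)
print(server.vram_gb)        # 8
print(server.default_model)  # 'orca-mini:3b' unless ?default_model=... is supplied
```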
@@ -67,6 +102,10 @@ def get_config():
_loaded.setdefault("servers", {})
_loaded["servers"].setdefault("order", [])
_loaded.setdefault("bot", {})
_loaded.setdefault("truth_api", {})
_loaded["truth_api"].setdefault("url", "https://bots.nexy7574.co.uk/jimmy/v2/api")
_loaded["truth_api"].setdefault("username", "invalid")
_loaded["truth_api"].setdefault("password", "invalid")
if database_url := os.getenv("DATABASE_URL"):
_loaded["bot"]["db_url"] = database_url
return _loaded
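These defaults imply a `[truth_api]` table in the config file for pointing the legacy-thread import at a Jimmy v2 instance; shown below with the built-in defaults (real credentials would replace the placeholders):

```toml
[truth_api]
url = "https://bots.nexy7574.co.uk/jimmy/v2/api"
username = "invalid"  # placeholder default
password = "invalid"  # placeholder default
```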

Added: tox.ini (new file, 2 lines)

@@ -0,0 +1,2 @@
[flake8]
max-line-length = 120