Compare commits

15 commits, all authored by nex. The status column shows the result of the
Build and Publish / build_and_publish (push) check for each commit; a dash
means no run was recorded.

SHA1        Message                                        Status                 Date
9a42ba815a  Act cache should be properly configured now    Failing after 2m39s    2024-06-17 00:53:14 +01:00
597ffd386c  Fix TypeError                                  Successful in 49s      2024-06-16 16:15:01 +01:00
290d5c9ccb  Fix ollama APi endpoint                        Successful in 51s      2024-06-16 16:10:50 +01:00
448a23affa  Allow importing legacy threads                 Successful in 58s      2024-06-16 15:53:43 +01:00
d203376850  Update the README                              Successful in 1m49s    2024-06-11 01:56:03 +01:00
76d3684449  Remove reference to throttle                   Successful in 45s      2024-06-11 01:44:34 +01:00
d4d550d7ba  Add a proper timeout to is_online              Cancelled              2024-06-11 01:44:09 +01:00
b6d747a63b  Don't follow up with an empty embed            Successful in 44s      2024-06-11 01:41:20 +01:00
e32d866ad4  add PS command                                 Successful in 48s      2024-06-11 01:37:20 +01:00
3c61504cb3  Fix /ollama pull                               -                      2024-06-11 01:21:34 +01:00
af11baeeaa  Clarify on-the-fly server names                Successful in 45s      2024-06-11 01:15:25 +01:00
954d01bca5  Add server info command                        Successful in 50s      2024-06-11 01:09:59 +01:00
c04e73dff9  Properly build master                          Successful in 44s      2024-06-11 01:03:51 +01:00
28908f217c  Enable ollama pull                             Failing after 1m53s    2024-06-11 00:58:17 +01:00
99001a60ba  Enable on-the-fly server construction          -                      2024-06-11 00:53:48 +01:00
5 changed files with 324 additions and 29 deletions

File 1 of 5: Build and Publish workflow (YAML)

@@ -16,10 +16,7 @@ jobs:
         id: meta
         uses: docker/metadata-action@v5
         with:
-          images: |
-            git.i-am.nexus/nex/sentient-jimmy
-          tags: |
-            type=sha
+          images: git.i-am.nexus/nex/sentient-jimmy

       - name: Log into forgejo CR
         uses: docker/login-action@v3
@@ -36,4 +33,4 @@ jobs:
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
           cache-from: type=gha
           cache-to: type=gha,mode=max

File 2 of 5: README

@@ -8,6 +8,14 @@ Another Ollama bot for discord, however designed for mesh self-hosting.
 [bot]
 token = "your-bot-token"
 debug_guilds = [0123456789]  # omit for global commands
+db_url = "sqlite://:memory:"
+# ^ The database URL. Overridden by $DATABASE_URL.
+# The default in a docker environment is IN MEMORY, i.e. `sqlite://:memory:`.
+# The default in a non-docker environment is sqlite://default.db, i.e. SQLite at ./default.db.
+# If $DATABASE_URL is set, it will override this setting.
+# You can use SQLite or PostgreSQL.
+# You can build jimmy with mysql/mssql/oracle support by changing the `asyncpg` extra
+# to `asyncmy`/`asyncodbc` in the tortoise-orm requirement in requirements.txt.

 [ollama]
 order = ["server1", "server2", "fallback"]
@@ -17,14 +25,23 @@ order = ["server1", "server2", "fallback"]
 base_url = "https://hosted.ollama.internal"  # default port is 443, because HTTPS
 gpu = true
 vram_gb = 8
+default_model = "llama3:latest"  # sets the default model for /ollama chat

 [ollama.server2]
 base_url = "http://192.168.1.2:11434"
 gpu = true
 vram_gb = 4  # <8GB will enable "low VRAM mode" in ollama
+default_model = "llama2:latest"

 [ollama.fallback]
 base_url = "http://192.168.1.250:11434"
 gpu = false
 vram_gb = 32  # in the case of CPU Ollama, "vram" is actually just regular RAM
+default_model = "orca-mini:3b"
 ```

+## Running
+
+See [the example docker-compose.yml](/docker-compose.yml) for how to run this bot with docker-compose.
+Alternatively, you can just run the docker image directly: `git.i-am.nexus/nex/sentient-jimmy:master`.
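The docker-compose.yml the README points at is not part of this diff. As a rough sketch of what such a service could look like, assuming the image published by the Build and Publish workflow; the service name, mount paths, and DATABASE_URL value are illustrative, not taken from the repository:

```yaml
services:
  jimmy:
    image: git.i-am.nexus/nex/sentient-jimmy:master
    restart: unless-stopped
    environment:
      # $DATABASE_URL overrides db_url from the config, per the README comment above
      DATABASE_URL: "sqlite:///data/jimmy.db"
    volumes:
      - ./config.toml:/app/config.toml:ro  # assumed config location inside the image
      - ./data:/data                       # persists the SQLite database across restarts
```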

File 3 of 5: the ollama chat cog (Python; path not shown in this view)

@@ -1,4 +1,5 @@
 import asyncio
+import datetime
 import io
 import logging
 import time
@@ -6,11 +7,12 @@ import typing
 import contextlib

 import discord
+import httpx
 from discord import Interaction
 from ollama import AsyncClient, ResponseError, Options
 from discord.ext import commands
 from jimmy.utils import create_ollama_message, find_suitable_server, decorate_server_name as decorate_name
-from jimmy.config import get_servers, get_server
+from jimmy.config import get_servers, get_server, get_config
 from jimmy.db import OllamaThread
 from humanize import naturalsize, naturaldelta
@@ -46,10 +48,13 @@ async def get_available_tags_autocomplete(ctx: discord.AutocompleteContext):
     chosen_server = get_server(ctx.options.get("server") or get_servers()[0].name)
     async with ollama_client(str(chosen_server.base_url), timeout=2) as client:
         tags = (await client.list())["models"]
-    return [tag["model"] for tag in tags if ctx.value.casefold() in tag["model"].casefold()]
+    v = [tag["model"] for tag in tags if ctx.value.casefold() in tag["model"].casefold()]
+    return [ctx.value, *v][:25]


-_ServerOptionChoices = [discord.OptionChoice(server.name, server.name) for server in get_servers()]
+_ServerOptionAutocomplete = discord.utils.basic_autocomplete(
+    [x.name for x in get_servers()]
+)


 class Chat(commands.Cog):
@@ -60,7 +65,13 @@
             self.server_locks[server.name] = asyncio.Lock()
         self.log = logging.getLogger(__name__)

-    @commands.slash_command()
+    ollama_group = discord.SlashCommandGroup(
+        name="ollama",
+        description="Commands related to ollama.",
+        guild_only=True
+    )
+
+    @ollama_group.command()
     async def status(self, ctx: discord.ApplicationContext):
         """Checks the status on all servers."""
         await ctx.defer()
@@ -71,10 +82,10 @@
         )
         fields = {}
         for server in get_servers():
-            if server.throttle and self.server_locks[server.name].locked():
+            if self.server_locks[server.name].locked():
                 embed.add_field(
                     name=decorate_name(server),
-                    value=f"\N{closed lock with key} In use.",
+                    value="\N{closed lock with key} In use.",
                     inline=False
                 )
                 fields[server] = len(embed.fields) - 1
@@ -82,7 +93,7 @@
             else:
                 embed.add_field(
                     name=decorate_name(server),
-                    value=f"\N{hourglass with flowing sand} Waiting...",
+                    value="\N{hourglass with flowing sand} Waiting...",
                     inline=False
                 )
                 fields[server] = len(embed.fields) - 1
@@ -90,7 +101,7 @@
         await ctx.respond(embed=embed)
         tasks = {}
         for server in get_servers():
-            if server.throttle and self.server_locks[server.name].locked():
+            if self.server_locks[server.name].locked():
                 continue
             tasks[server] = asyncio.create_task(server.is_online())
@@ -100,19 +111,52 @@
                 embed.set_field_at(
                     fields[server],
                     name=decorate_name(server),
-                    value=f"\N{white heavy check mark} Online.",
+                    value="\N{white heavy check mark} Online.",
                     inline=False
                 )
             else:
                 embed.set_field_at(
                     fields[server],
                     name=decorate_name(server),
-                    value=f"\N{cross mark} Offline.",
+                    value="\N{cross mark} Offline.",
                     inline=False
                 )
         await ctx.edit(embed=embed)

-    @commands.slash_command(name="ollama")
+    @ollama_group.command(name="server-info")
+    async def get_server_info(
+        self,
+        ctx: discord.ApplicationContext,
+        server: typing.Annotated[
+            str,
+            discord.Option(
+                discord.SlashCommandOptionType.string,
+                description="The server to use.",
+                autocomplete=_ServerOptionAutocomplete,
+                default=get_servers()[0].name
+            )
+        ]
+    ):
+        """Gets information on a given server"""
+        await ctx.defer()
+        server = get_server(server)
+        is_online = await server.is_online()
+        y = "\N{white heavy check mark}"
+        x = "\N{cross mark}"
+        t = {True: y, False: x}
+        rt = "VRAM" if server.gpu else "RAM"
+        lines = [
+            f"Name: {server.name!r}",
+            f"Base URL: {server.base_url!r}",
+            f"GPU Enabled: {t[server.gpu]}",
+            f"{rt}: {server.vram_gb:,} GB",
+            f"Default Model: {server.default_model!r}",
+            f"Is Online: {t[is_online]}"
+        ]
+        p = "```md\n" + "\n".join(lines) + "```"
+        return await ctx.respond(p)
+
+    @ollama_group.command(name="chat")
     async def start_ollama_chat(
         self,
         ctx: discord.ApplicationContext,
@@ -130,7 +174,7 @@
             discord.Option(
                 discord.SlashCommandOptionType.string,
                 description="The server to use.",
-                choices=_ServerOptionChoices,
+                autocomplete=_ServerOptionAutocomplete,
                 default=get_servers()[0].name
             )
         ],
@@ -140,7 +184,7 @@
                 discord.SlashCommandOptionType.string,
                 description="The model to use.",
                 autocomplete=get_available_tags_autocomplete,
-                default="llama3:latest"
+                default="default"
             )
         ],
         image: typing.Annotated[
@@ -173,7 +217,9 @@
         """Have a chat with ollama"""
         await ctx.defer()
         server = get_server(server)
-        if not await server.is_online():
+        if not server:
+            return await ctx.respond("\N{cross mark} Unknown Server.")
+        elif not await server.is_online():
             await ctx.respond(
                 content=f"{server} is offline. Finding a suitable server...",
             )
@@ -183,14 +229,17 @@
                 return await ctx.edit(content=str(err), delete_after=30)
             await ctx.delete(delay=5)

         async with self.server_locks[server.name]:
+            if model == "default":
+                model = server.default_model
             async with ollama_client(str(server.base_url)) as client:
                 client: AsyncClient
                 self.log.info("Checking if %r has the model %r", server, model)
                 tags = (await client.list())["models"]
+                # Download code. It's recommended to collapse this in the editor.
                 if model not in [x["model"] for x in tags]:
                     embed = discord.Embed(
                         title=f"Downloading {model} on {server}.",
-                        description=f"Initiating download...",
+                        description="Initiating download...",
                         color=discord.Color.blurple()
                     )
                     view = StopDownloadView(ctx)
@@ -265,6 +314,7 @@
                     await ctx.edit(embed=embed, delete_after=30, view=None)

         messages = []
+        thread = None
         if thread_id:
             thread = await OllamaThread.get_or_none(thread_id=thread_id)
             if thread:
@@ -272,8 +322,29 @@
                 messages.append(
                     await create_ollama_message(msg["content"], role=msg["role"])
                 )
+            elif len(thread_id) == 6:
+                # Is a legacy thread
+                _cfg = get_config()["truth_api"]
+                async with httpx.AsyncClient(
+                    base_url=_cfg["url"],
+                    auth=(_cfg["username"], _cfg["password"])
+                ) as http_client:
+                    response = await http_client.get(f"/ollama/thread/threads:{thread_id}")
+                    if response.status_code == 200:
+                        thread = response.json()
+                        messages = thread["messages"]
+                        thread = OllamaThread(
+                            messages=[{"role": m["role"], "content": m["content"]} for m in messages],
+                        )
+                        await thread.save()
+                    else:
+                        return await ctx.respond(
+                            content="Failed to fetch legacy ollama thread from jimmy v2: HTTP %d (`%r`)" % (
+                                response.status_code, response.text
+                            ),
+                        )
             else:
-                await ctx.respond(content="No thread with that ID exists.", delete_after=30)
+                return await ctx.respond(content="No thread with that ID exists.", delete_after=30)

         if system_prompt:
             messages.append(await create_ollama_message(system_prompt, role="system"))
         messages.append(await create_ollama_message(prompt, images=[await image.read()] if image else None))
@@ -325,18 +396,187 @@
             embed.add_field(
                 name="Full chat",
                 value="The chat was too long to fit in this message. "
-                      f"You can download the `full-chat.txt` file to see the full message."
+                      "You can download the `full-chat.txt` file to see the full message."
             )
         else:
             file = discord.utils.MISSING

-        thread = OllamaThread(
-            messages=[{"role": m["role"], "content": m["content"]} for m in messages],
-        )
-        await thread.save()
+        if not thread:
+            thread = OllamaThread(
+                messages=[{"role": m["role"], "content": m["content"]} for m in messages],
+            )
+            await thread.save()
         embed.set_footer(text=f"Chat ID: {thread.thread_id}")
         await msg.edit(embed=embed, view=None, file=file)
+
+    @ollama_group.command(name="pull")
+    async def pull_ollama_model(
+        self,
+        ctx: discord.ApplicationContext,
+        server: typing.Annotated[
+            str,
+            discord.Option(
+                discord.SlashCommandOptionType.string,
+                description="The server to use.",
+                autocomplete=_ServerOptionAutocomplete,
+                default=get_servers()[0].name
+            )
+        ],
+        model: typing.Annotated[
+            str,
+            discord.Option(
+                discord.SlashCommandOptionType.string,
+                description="The model to use.",
+                autocomplete=get_available_tags_autocomplete,
+                default="llama3:latest"
+            )
+        ],
+    ):
+        """Downloads a tag on the target server"""
+        await ctx.defer()
+        server = get_server(server)
+        if not server:
+            return await ctx.respond("\N{cross mark} Unknown server.")
+        elif not await server.is_online():
+            return await ctx.respond(f"\N{cross mark} Server {server.name!r} is not responding")
+        embed = discord.Embed(
+            title=f"Downloading {model} on {server}.",
+            description="Initiating download...",
+            color=discord.Color.blurple()
+        )
+        view = StopDownloadView(ctx)
+        await ctx.respond(
+            embed=embed,
+            view=view
+        )
+        last_edit = 0
+        async with ctx.typing():
+            try:
+                last_completed = 0
+                last_completed_ts = time.time()
+                async with ollama_client(str(server.base_url)) as client:
+                    async for line in await client.pull(model, stream=True):
+                        if view.event.is_set():
+                            embed.add_field(name="Error!", value="Download cancelled.")
+                            embed.colour = discord.Colour.red()
+                            await ctx.edit(embed=embed)
+                            return
+                        self.log.debug("Response from %r: %r", server, line)
+                        if line["status"] in {
+                            "pulling manifest",
+                            "verifying sha256 digest",
+                            "writing manifest",
+                            "removing any unused layers",
+                            "success"
+                        }:
+                            embed.description = line["status"].capitalize()
+                        else:
+                            total = line["total"]
+                            completed = line.get("completed", 0)
+                            percent = round(completed / total * 100, 1)
+                            pb_fill = "█" * int(percent / 10)
+                            pb_empty = "░" * (10 - int(percent / 10))
+                            bytes_per_second = completed - last_completed
+                            bytes_per_second /= (time.time() - last_completed_ts)
+                            last_completed = completed
+                            last_completed_ts = time.time()
+                            mbps = round((bytes_per_second * 8) / 1024 / 1024)
+                            eta = (total - completed) / max(1, bytes_per_second)
+                            progress_bar = f"[{pb_fill}{pb_empty}]"
+                            ns_total = naturalsize(total, binary=True)
+                            ns_completed = naturalsize(completed, binary=True)
+                            embed.description = (
+                                f"{line['status'].capitalize()} {percent}% {progress_bar} "
+                                f"({ns_completed}/{ns_total} @ {mbps} Mb/s) "
+                                f"[ETA: {naturaldelta(eta)}]"
+                            )
+                        if time.time() - last_edit >= 2.5:
+                            await ctx.edit(embed=embed)
+                            last_edit = time.time()
+            except ResponseError as err:
+                if err.error.endswith("file does not exist"):
+                    await ctx.edit(
+                        embed=None,
+                        content="The model %r does not exist." % model,
+                        delete_after=60,
+                        view=None
+                    )
+                else:
+                    embed.add_field(
+                        name="Error!",
+                        value=err.error
+                    )
+                    embed.colour = discord.Colour.red()
+                    await ctx.edit(embed=embed, view=None)
+                return
+            else:
+                embed.colour = discord.Colour.green()
+                embed.description = f"Downloaded {model} on {server}."
+                await ctx.edit(embed=embed, delete_after=30, view=None)
+
+    @ollama_group.command(name="ps")
+    async def ollama_proc_list(
+        self,
+        ctx: discord.ApplicationContext,
+        server: typing.Annotated[
+            str,
+            discord.Option(
+                discord.SlashCommandOptionType.string,
+                description="The server to use.",
+                autocomplete=_ServerOptionAutocomplete,
+                default=get_servers()[0].name
+            )
+        ]
+    ):
+        """Checks the loaded models on the target server"""
+        await ctx.defer()
+        server = get_server(server)
+        if not server:
+            return await ctx.respond("\N{cross mark} Unknown server.")
+        elif not await server.is_online():
+            return await ctx.respond(f"\N{cross mark} Server {server.name!r} is not responding")
+        async with ollama_client(str(server.base_url)) as client:
+            response = (await client.ps())["models"]
+            if not response:
+                embed = discord.Embed(
+                    title=f"No models loaded on {server}.",
+                    color=discord.Color.blurple()
+                )
+                return await ctx.respond(embed=embed)
+            embed = discord.Embed(
+                title=f"Models loaded on {server}",
+                color=discord.Color.blurple()
+            )
+            for model in response[:25]:
+                size = naturalsize(model["size"], binary=True)
+                size_vram = naturalsize(model["size_vram"], binary=True)
+                size_ram = naturalsize(model["size"] - model["size_vram"], binary=True)
+                percent_in_vram = round(model["size_vram"] / model["size"] * 100)
+                percent_in_ram = 100 - percent_in_vram
+                expires = datetime.datetime.fromisoformat(model["expires_at"])
+                lines = [
+                    f"* Size: {size}",
+                    f"* Unloaded: {discord.utils.format_dt(expires, style='R')}",
+                ]
+                if percent_in_ram > 0:
+                    lines.extend(
+                        [
+                            f"* VRAM/RAM: {percent_in_vram}%/{percent_in_ram}%",
+                            f"* VRAM Size: {size_vram}",
+                            f"* RAM Size: {size_ram}"
+                        ]
+                    )
+                else:
+                    lines.append(f"* VRAM Size: {size_vram} (100%)")
+                embed.add_field(
+                    name=model["model"],
+                    value="\n".join(lines),
+                    inline=False
+                )
+        await ctx.respond(embed=embed)


 def setup(bot):
     bot.add_cog(Chat(bot))
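The pull handler above derives its throughput and ETA figures from the deltas between successive status lines. Restated as a standalone sketch for clarity (the function name and signature are illustrative, not part of the bot's API):

```python
import time

def progress_stats(completed: int, total: int, last_completed: int, last_ts: float):
    """Percent, megabits/s, and ETA for one pull progress update, as in the cog above."""
    elapsed = max(time.time() - last_ts, 1e-6)          # guard against a zero interval
    bytes_per_second = (completed - last_completed) / elapsed
    percent = round(completed / total * 100, 1)
    mbps = round((bytes_per_second * 8) / 1024 / 1024)  # bytes/s -> megabits/s
    eta_seconds = (total - completed) / max(1.0, bytes_per_second)
    return percent, mbps, eta_seconds
```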

File 4 of 5: jimmy/config.py

@@ -1,6 +1,7 @@
 import os
 import tomllib
 import logging
+import urllib.parse
 from typing import Callable

 import httpx
@@ -10,11 +11,11 @@ log = logging.getLogger(__name__)

 class ServerConfig(BaseModel):
-    name: str = Field(min_length=1, max_length=32)
+    name: str = Field(min_length=1, max_length=4096)
     base_url: AnyHttpUrl
     gpu: bool = False
     vram_gb: int = 4
-    throttle: bool = False
+    default_model: str = "llama3:latest"

     def __repr__(self):
         return "<ServerConfig name={0.name} base_url={0.base_url} gpu={0.gpu!s} vram_gb={0.vram_gb}>".format(self)
@@ -26,7 +27,7 @@ class ServerConfig(BaseModel):
         """
         Checks that the current server is online and responding to requests.
         """
-        async with httpx.AsyncClient(base_url=str(self.base_url)) as client:
+        async with httpx.AsyncClient(base_url=str(self.base_url), timeout=httpx.Timeout(2.25)) as client:
             try:
                 response = await client.get("/api/tags")
                 return response.status_code == 200
@@ -57,6 +58,40 @@ def get_server(name_or_base_url: str) -> ServerConfig | None:
     for server in servers:
         if server.name == name_or_base_url or server.base_url == name_or_base_url:
             return server
+    try:
+        parsed = urllib.parse.urlparse(name_or_base_url)
+    except ValueError:
+        pass
+    else:
+        if parsed.netloc and parsed.scheme in ["http", "https"]:
+            defaults = {
+                "name": ":temporary:-:%s:" % parsed.hostname,
+                "base_url": "{0.scheme}://{0.netloc}".format(parsed),
+                "gpu": False,
+                "vram_gb": 2,
+                "default_model": "orca-mini:3b"
+            }
+            if parsed.path and parsed.path.endswith(("/api", "/api/")):
+                defaults["base_url"] += parsed.path
+            parsed_qs = urllib.parse.parse_qs(parsed.query)
+            for key, values in parsed_qs.items():
+                if not values:
+                    continue
+                if key == "gpu":
+                    values = [
+                        values[0][0].lower() in ("t", "1", "y")
+                    ]
+                elif key == "vram_gb":
+                    try:
+                        values = [
+                            int(values[0])
+                        ]
+                    except ValueError:
+                        values = []
+                if values:
+                    defaults[key] = values[0]
+            return ServerConfig(**defaults)
     return None
@@ -67,6 +102,10 @@ def get_config():
     _loaded.setdefault("servers", {})
     _loaded["servers"].setdefault("order", [])
     _loaded.setdefault("bot", {})
+    _loaded.setdefault("truth_api", {})
+    _loaded["truth_api"].setdefault("url", "https://bots.nexy7574.co.uk/jimmy/v2/api")
+    _loaded["truth_api"].setdefault("username", "invalid")
+    _loaded["truth_api"].setdefault("password", "invalid")
     if database_url := os.getenv("DATABASE_URL"):
         _loaded["bot"]["db_url"] = database_url
     return _loaded
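With the fallback above, get_server now accepts any http(s) URL as an on-the-fly server, with ?gpu= and ?vram_gb= query parameters overriding the temporary defaults. A quick sketch of the resulting behaviour, assuming the parsing logic shown in this hunk (the URL itself is illustrative):

```python
from jimmy.config import get_server

# Not in the configured server list, but a valid http URL, so get_server
# builds a temporary ServerConfig instead of returning None.
server = get_server("http://192.168.1.5:11434?gpu=true&vram_gb=8")
assert server.name == ":temporary:-:192.168.1.5:"
assert server.gpu is True                      # "?gpu=true": leading "t" counts as truthy
assert server.vram_gb == 8                     # parsed from "?vram_gb=8"
assert server.default_model == "orca-mini:3b"  # hard-coded default for temporary servers
```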

File 5 of 5: tox.ini (new file, 2 additions)

@@ -0,0 +1,2 @@
+[flake8]
+max-line-length = 120
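flake8 reads `[flake8]` sections from tox.ini, so running `flake8` at the repository root applies the 120-character line limit with no extra flags; this lines up with the f-string cleanups in the chat cog above.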