From e28c75734bd4db87478f16427805aa0613c48247 Mon Sep 17 00:00:00 2001
From: vmpuri <45368418+vmpuri@users.noreply.github.com>
Date: Mon, 12 Aug 2024 15:38:42 -0700
Subject: [PATCH] Make API and server compatible with OpenAI API

---
 README.md          | 23 ++++++++-------
 browser/browser.py | 70 +++++++++++++++++++++++++++++++---------------
 server.py          |  3 ---
 3 files changed, 61 insertions(+), 35 deletions(-)

diff --git a/README.md b/README.md
index 1f8ff6b2e..6c61c4511 100644
--- a/README.md
+++ b/README.md
@@ -181,16 +181,6 @@ This mode generates text based on an input prompt.
 python3 torchchat.py generate llama3.1 --prompt "write me a story about a boy and his bear"
 ```
 
-### Browser
-This mode allows you to chat with the model using a UI in your browser
-Running the command automatically open a tab in your browser.
-
-[skip default]: begin
-
-```
-streamlit run torchchat.py -- browser llama3.1
-```
-
 [skip default]: end
 
 ### Server
@@ -252,6 +242,19 @@ curl http://127.0.0.1:5000/v1/chat \
 
 
 
+### Browser
+This command opens a basic browser interface for local chat by querying a local server.
+
+First, follow the steps in the Server section above to start a local server. Then, in another terminal, launch the interface. Running the following will open a tab in your browser.
+
+[skip default]: begin
+
+```
+streamlit run browser/browser.py
+```
+
+Use the "Max Response Tokens" slider to limit the maximum number of tokens generated by the model for each response. Click the "Reset Chat" button to remove the message history and start a fresh chat.
+
 ## Desktop/Server Execution
 
 
diff --git a/browser/browser.py b/browser/browser.py
index e702c3539..b858a87eb 100644
--- a/browser/browser.py
+++ b/browser/browser.py
@@ -1,40 +1,66 @@
 import streamlit as st
 from openai import OpenAI
+import time  # used for the tokens/second timing in get_streamed_completion below
 
+st.title("torchchat")
+
+start_state = [
+    {
+        "role": "system",
+        "content": "You're an assistant. Answer questions directly, be brief, and have fun.",
+    },
+    {"role": "assistant", "content": "How can I help you?"},
+]
+
 with st.sidebar:
-    openai_api_key = st.text_input(
-        "OpenAI API Key", key="chatbot_api_key", type="password"
+    response_max_tokens = st.slider(
+        "Max Response Tokens", min_value=10, max_value=1000, value=250, step=10
     )
-    "[Get an OpenAI API key](https://platform.openai.com/account/api-keys)"
-    "[View the source code](https://github.com/streamlit/llm-examples/blob/main/Chatbot.py)"
-    "[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/streamlit/llm-examples?quickstart=1)"
-
-st.title("💬 Chatbot")
+    if st.button("Reset Chat", type="primary"):
+        st.session_state["messages"] = start_state
 
 if "messages" not in st.session_state:
-    st.session_state["messages"] = [
-        {
-            "role": "system",
-            "content": "You're an assistant. Be brief, no yapping. Use as few words as possible to respond to the users' questions.",
-        },
-        {"role": "assistant", "content": "How can I help you?"},
-    ]
+    st.session_state["messages"] = start_state
+
 for msg in st.session_state.messages:
     st.chat_message(msg["role"]).write(msg["content"])
 
 if prompt := st.chat_input():
     client = OpenAI(
-        # This is the default and can be omitted
         base_url="http://127.0.0.1:5000/v1",
-        api_key="YOURMOTHER",
+        api_key="813",  # The OpenAI client requires an API key, but the local server never checks it, so any non-empty string works.
     )
     st.session_state.messages.append({"role": "user", "content": prompt})
     st.chat_message("user").write(prompt)
-    response = client.chat.completions.create(
-        model="stories15m", messages=st.session_state.messages, max_tokens=64
-    )
-    msg = response.choices[0].message.content
-    st.session_state.messages.append({"role": "assistant", "content": msg})
-    st.chat_message("assistant").write(msg)
+
+    with st.chat_message("assistant"), st.status(
+        "Generating... ", expanded=True
+    ) as status:
+
+        def get_streamed_completion(completion_generator):
+            start = time.time()
+            tokcount = 0
+            for chunk in completion_generator:
+                tokcount += 1
+                yield chunk.choices[0].delta.content
+
+            status.update(
+                label="Done, averaged {:.2f} tokens/second".format(
+                    tokcount / (time.time() - start)
+                ),
+                state="complete",
+            )
+
+        response = st.write_stream(
+            get_streamed_completion(
+                client.chat.completions.create(
+                    model="llama3",
+                    messages=st.session_state.messages,
+                    max_tokens=response_max_tokens,
+                    stream=True,
+                )
+            )
+        )[0]
+
+        st.session_state.messages.append({"role": "assistant", "content": response})
diff --git a/server.py b/server.py
index f1dbbcdc9..ad3f69ef4 100644
--- a/server.py
+++ b/server.py
@@ -75,9 +75,6 @@ def chunk_processor(chunked_completion_generator):
                 next_tok = ""
             print(next_tok, end="", flush=True)
             yield f"data:{json.dumps(_del_none(asdict(chunk)))}\n\n"
-            # wasda = json.dumps(asdict(chunk))
-            # print(wasda)
-            # yield wasda
 
     resp = Response(
         chunk_processor(gen.chunked_completion(req)),
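
The browser UI added by this patch is only a thin Streamlit client over the OpenAI-compatible endpoint that server.py exposes, so the same streaming flow can be exercised from a plain Python script. The following is a minimal sketch, not part of the patch: it assumes the local server from the README's Server section is already listening on 127.0.0.1:5000, and it reuses the "llama3" model name and port from browser/browser.py, which may differ in your setup.

# Minimal streaming sketch against the local OpenAI-compatible server.
# Assumes the torchchat server (see the Server section of the README) is
# running on 127.0.0.1:5000 and accepts the "llama3" model name used by
# browser/browser.py; adjust both if your setup differs.
from openai import OpenAI

client = OpenAI(
    base_url="http://127.0.0.1:5000/v1",
    api_key="unused",  # any non-empty string; the local server does not check it
)

stream = client.chat.completions.create(
    model="llama3",
    messages=[
        {"role": "system", "content": "You're an assistant. Be brief."},
        {"role": "user", "content": "Write one sentence about a boy and his bear."},
    ],
    max_tokens=250,
    stream=True,
)

for chunk in stream:
    # Some chunks (e.g. the final one) carry no text, so guard against None.
    print(chunk.choices[0].delta.content or "", end="", flush=True)
print()

This mirrors what get_streamed_completion does inside st.write_stream, minus the Streamlit UI and the tokens/second bookkeeping.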