From 2e8ceadecf9eb6fe31fb11ae6378e2fac1264de6 Mon Sep 17 00:00:00 2001
From: c0sogi <121936784+c0sogi@users.noreply.github.com>
Date: Mon, 4 Sep 2023 11:25:10 +0900
Subject: [PATCH] Update readme.md

---
 readme.md | 86 +++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 77 insertions(+), 9 deletions(-)

diff --git a/readme.md b/readme.md
index 7dcc74e..2219f53 100644
--- a/readme.md
+++ b/readme.md
@@ -32,7 +32,11 @@ python -m main
 ```
 Options:
 ```b
-usage: main.py [-h] [--port PORT] [--max-workers MAX_WORKERS] [--max-semaphores MAX_SEMAPHORES] [--api-key API_KEY] [--xformers] [--no-embed] [--tunnel] [--install-pkgs] [--force-cuda] [--skip-torch-install] [--skip-tf-install] [--skip-compile]
+usage: main.py [-h] [--port PORT] [--max-workers MAX_WORKERS]
+               [--max-semaphores MAX_SEMAPHORES]
+               [--max-tokens-limit MAX_TOKENS_LIMIT] [--api-key API_KEY]
+               [--no-embed] [--tunnel] [--install-pkgs] [--force-cuda]
+               [--skip-torch-install] [--skip-tf-install] [--skip-compile]
                [--no-cache-dir] [--upgrade]

 options:
@@ -41,22 +45,31 @@ options:
   --max-workers MAX_WORKERS, -w MAX_WORKERS
                         Maximum number of process workers to run; default is 1
   --max-semaphores MAX_SEMAPHORES, -s MAX_SEMAPHORES
-                        Maximum number of process semaphores to permit; default is 1
+                        Maximum number of process semaphores to permit;
+                        default is 1
+  --max-tokens-limit MAX_TOKENS_LIMIT, -l MAX_TOKENS_LIMIT
+                        Set the maximum number of tokens to `max_tokens`. This
+                        is needed to limit the number of tokens
+                        generated. Default is None, which means no limit.
   --api-key API_KEY, -k API_KEY
                         API key to use for the server
   --no-embed            Disable embeddings endpoint
   --tunnel, -t          Tunnel the server through cloudflared
-  --install-pkgs, -i    Install all required packages before running the server
-  --force-cuda, -c      Force CUDA version of pytorch to be used when installing pytorch. e.g. torch==2.0.1+cu118
+  --install-pkgs, -i    Install all required packages before running the
+                        server
+  --force-cuda, -c      Force CUDA version of pytorch to be used when
+                        installing pytorch. e.g. torch==2.0.1+cu118
   --skip-torch-install, --no-torch
-                        Skip installing pytorch, if `install-pkgs` is set
+                        Skip installing pytorch, if `install-pkgs` is set
   --skip-tf-install, --no-tf
-                        Skip installing tensorflow, if `install-pkgs` is set
+                        Skip installing tensorflow, if `install-pkgs` is set
   --skip-compile, --no-compile
-                        Skip compiling the shared library of LLaMA C++ code
+                        Skip compiling the shared library of LLaMA C++ code
   --no-cache-dir, --no-cache
-                        Disable caching of pip installs, if `install-pkgs` is set
-  --upgrade, -u         Upgrade all packages and repositories before running the server
+                        Disable caching of pip installs, if `install-pkgs` is
+                        set
+  --upgrade, -u         Upgrade all packages and repositories before running
+                        the server
 ```

 ### Unique features
@@ -132,6 +145,61 @@ openai_replacement_models = {"gpt-3.5-turbo": "my_ggml", "gpt-4": "my_gptq2"}
 ```

 The RoPE frequency and scaling factor will be automatically calculated and set if you don't set them in the model definition. Assuming that you are using Llama2 model.
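+
+If you want to pin them yourself instead, a definition might look like the sketch below. The `rope_freq_base` and `rope_freq_scale` field names follow common llama.cpp conventions and are assumptions here, as is the model path; check `llama_api.schemas.models` for the exact names:
+
+```python
+# In your model definitions file (illustrative sketch only)
+from llama_api.schemas.models import LlamaCppModel
+
+my_scaled_ggml = LlamaCppModel(
+    model_path="models/my-llama2-13b.ggmlv3.q4_K_M.bin",  # hypothetical path
+    max_total_tokens=8192,     # twice the native 4096-token context of Llama 2
+    rope_freq_base=10000.0,    # assumed field: RoPE base frequency
+    rope_freq_scale=0.5,       # assumed field: 4096 / 8192 linear scaling factor
+)
+```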
+
+## Usage: Langchain integration
+
+Langchain allows you to incorporate custom language models seamlessly. This guide will walk you through setting up your own custom model, replacing OpenAI models, and running text or chat completions.
+
+1. Defining Your Custom Model
+
+First, you need to define your custom language model in a Python file, for instance, `my_model_def.py`. This file should include the definition of your custom model.
+
+```python
+# my_model_def.py
+from llama_api.schemas.models import LlamaCppModel, ExllamaModel
+
+mythomax_l2_13b_gptq = ExllamaModel(
+    model_path="TheBloke/MythoMax-L2-13B-GPTQ",  # automatic download
+    max_total_tokens=4096,
+)
+```
+
+In the example above, we've defined a custom model named `mythomax_l2_13b_gptq` using the `ExllamaModel` class.
+
+2. Replacing OpenAI Models
+
+You can replace an OpenAI model with your custom model using the `openai_replacement_models` dictionary. Add your custom model to this dictionary in the `my_model_def.py` file.
+
+```python
+# my_model_def.py (continued)
+openai_replacement_models = {"gpt-3.5-turbo": "mythomax_l2_13b_gptq"}
+```
+
+Here, we replaced the `gpt-3.5-turbo` model with our custom `mythomax_l2_13b_gptq` model.
+
+3. Running Text/Chat Completions
+
+Finally, you can utilize your custom model in Langchain for performing text and chat completions.
+
+```python
+# langchain_test.py
+from langchain.chat_models import ChatOpenAI
+from os import environ
+
+environ["OPENAI_API_KEY"] = "Bearer foo"
+
+chat_model = ChatOpenAI(
+    model="gpt-3.5-turbo",
+    openai_api_base="http://localhost:8000/v1",
+)
+print(chat_model.predict("hi!"))
+```
+
+Now, running the `langchain_test.py` file will make use of your custom model for completions.
+
+That's it! You've successfully integrated a custom model into Langchain. Enjoy your enhanced text and chat completions!
+
 ## Usage: Text Completion

 Now, you can send a request to the server.
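+
+For example, a minimal sanity check with the `openai` Python package (pre-1.0 interface) could look like the sketch below; it assumes the server is running locally on port 8000 and that a model named `my_ggml` is defined:
+
+```python
+# completion_test.py (illustrative sketch only)
+import openai
+
+openai.api_base = "http://localhost:8000/v1"  # point the client at the local server
+openai.api_key = "foo"  # placeholder; use your key if the server was started with --api-key
+
+completion = openai.Completion.create(
+    model="my_ggml",  # a model name from your definitions file
+    prompt="Hello, my name is",
+    max_tokens=30,
+)
+print(completion.choices[0].text)
+```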