From bc8b94217415a0dff07f7b18a88ac46e7ac0bf4a Mon Sep 17 00:00:00 2001
From: Paolo Rechia
Date: Tue, 6 Jun 2023 22:52:38 +0200
Subject: [PATCH 1/3] Guidance extension

---
 extensions/guidance/guidance_server.py | 64 ++++++++++++++++++++++++++
 extensions/guidance/requirements.txt   |  1 +
 extensions/guidance/script.py          |  5 ++
 modules/shared.py                      | 12 ++++-
 requirements.txt                       |  1 +
 server.py                              | 16 +++++++
 6 files changed, 98 insertions(+), 1 deletion(-)
 create mode 100644 extensions/guidance/guidance_server.py
 create mode 100644 extensions/guidance/requirements.txt
 create mode 100644 extensions/guidance/script.py

diff --git a/extensions/guidance/guidance_server.py b/extensions/guidance/guidance_server.py
new file mode 100644
index 0000000000..68ea13b44b
--- /dev/null
+++ b/extensions/guidance/guidance_server.py
@@ -0,0 +1,64 @@
+import json
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+from threading import Thread
+
+import guidance
+from modules import shared
+
+class Handler(BaseHTTPRequestHandler):
+    def do_GET(self):
+        if self.path == '/api/v1/model':
+            self.send_response(200)
+            self.end_headers()
+            response = json.dumps({
+                'result': shared.model_name
+            })
+
+            self.wfile.write(response.encode('utf-8'))
+        else:
+            self.send_error(404)
+
+    def do_POST(self):
+        content_length = int(self.headers['Content-Length'])
+        body = json.loads(self.rfile.read(content_length).decode('utf-8'))
+
+        if self.path == '/guidance_api/v1/generate':
+            # TODO: add request validation
+            # For now disabled to avoid an extra dependency on validation libraries, like Pydantic
+
+            prompt_template = body["prompt_template"]
+            input_vars = body["input_vars"]
+            kwargs = body["guidance_kwargs"]
+            output_vars = body["output_vars"]
+
+            guidance_program = guidance(prompt_template)
+            program_result = guidance_program(
+                **kwargs,
+                stream=False,
+                async_mode=False,
+                caching=False,
+                **input_vars,
+                llm=shared.guidance_model,
+            )
+            output = {"__main__": str(program_result)}
+            for output_var in output_vars:
+                output[output_var] = program_result[output_var]
+
+            response = json.dumps(output)
+            self.wfile.write(response.encode('utf-8'))
+
+
+def _run_server(port: int):
+    address = '0.0.0.0' if shared.args.listen else '127.0.0.1'
+
+    server = ThreadingHTTPServer((address, port), Handler)
+    print(f'Starting API at http://{address}:{port}/api')
+
+    server.serve_forever()
+
+
+def start_server(port: int):
+    if not shared.guidance_model:
+        raise ValueError("Guidance model was not properly initialized. Cannot start guidance extension.")
+
+    Thread(target=_run_server, args=[port], daemon=True).start()
diff --git a/extensions/guidance/requirements.txt b/extensions/guidance/requirements.txt
new file mode 100644
index 0000000000..07d0c54f45
--- /dev/null
+++ b/extensions/guidance/requirements.txt
@@ -0,0 +1 @@
+guidance
\ No newline at end of file
diff --git a/extensions/guidance/script.py b/extensions/guidance/script.py
new file mode 100644
index 0000000000..404d01da4b
--- /dev/null
+++ b/extensions/guidance/script.py
@@ -0,0 +1,5 @@
+from extensions.guidance import guidance_server
+from modules import shared
+
+def setup():
+    guidance_server.start_server(shared.args.guidance_port)
diff --git a/modules/shared.py b/modules/shared.py
index 9f4f720c68..d99777932a 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -11,6 +11,7 @@
 tokenizer = None
 model_name = "None"
 model_type = None
+guidance_model = None
 lora_names = []
 
 # Chat variables
@@ -174,6 +175,10 @@ def str2bool(v):
 parser.add_argument('--api-streaming-port', type=int, default=5005, help='The listening port for the streaming API.')
 parser.add_argument('--public-api', action='store_true', help='Create a public URL for the API using Cloudfare.')
 
+# Guidance Server
+parser.add_argument('--guidance', action='store_true', help='Enable the guidance API extension.')
+parser.add_argument('--guidance-port', type=int, default=9000, help='The listening port for the blocking guidance API.')
+
 # Multimodal
 parser.add_argument('--multimodal-pipeline', type=str, default=None, help='The multimodal pipeline to use. Examples: llava-7b, llava-13b.')
 
@@ -206,11 +211,12 @@ def add_extension(name):
 if args.multimodal_pipeline is not None:
     add_extension('multimodal')
 
+if args.guidance:
+    add_extension("guidance_server")
 
 def is_chat():
     return args.chat
 
-
 # Loading model-specific settings
 with Path(f'{args.model_dir}/config.yaml') as p:
     if p.exists():
@@ -229,3 +235,7 @@ def is_chat():
             model_config[k] = user_config[k]
 
 model_config = OrderedDict(model_config)
+
+
+def use_guidance():
+    return args.guidance
diff --git a/requirements.txt b/requirements.txt
index 9fdab32856..13a6ba8a02 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -23,3 +23,4 @@ llama-cpp-python==0.1.57; platform_system != "Windows"
 https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.57/llama_cpp_python-0.1.57-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.0/auto_gptq-0.2.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.0/auto_gptq-0.2.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux"
+guidance
\ No newline at end of file
diff --git a/server.py b/server.py
index 5e770d1b45..989ab30c78 100644
--- a/server.py
+++ b/server.py
@@ -1088,6 +1088,22 @@ def create_interface():
             'instruction_template': shared.settings['instruction_template']
         })
+    # This extension depends on having the model already fully loaded, including LoRA
+    if shared.use_guidance():
+        try:
+            import guidance
+        except ImportError:
+            raise ImportError("Please run 'pip install guidance' before using the guidance extension.")
+
+        # For now only supports HF Transformers
+        # As far as I know, this includes GPTQ variants and LoRAs
+        shared.guidance_model = guidance.llms.Transformers(
+            model=shared.model,
+            tokenizer=shared.tokenizer,
+            device="auto"
+        )
+
+
     shared.generation_lock = Lock()
 
     # Launch the web UI
     create_interface()

From c8b830629d5acb05de24705e11493dbf11e36349 Mon Sep 17 00:00:00 2001
From: Paolo Rechia
Date: Tue, 6 Jun 2023 23:51:16 +0200
Subject: [PATCH 2/3] First working version of extension

---
 extensions/guidance/guidance_server.py | 77 ++++++++++++++++++++------
 modules/shared.py                      |  7 +--
 requirements.txt                       |  3 +-
 server.py                              | 15 -----
 4 files changed, 65 insertions(+), 37 deletions(-)

diff --git a/extensions/guidance/guidance_server.py b/extensions/guidance/guidance_server.py
index 68ea13b44b..0f10082fd4 100644
--- a/extensions/guidance/guidance_server.py
+++ b/extensions/guidance/guidance_server.py
@@ -1,3 +1,27 @@
+"""Loads model into the guidance library (https://github.com/microsoft/guidance).
+It aims to reduce the entry barrier of using the guidance library with quantized models
+
+The easiest way to get started with this extension is by using the Python client wrapper:
+
+https://github.com/ChuloAI/andromeda-chain
+
+Example:
+
+```
+from andromeda_chain import AndromedaChain, AndromedaPrompt, AndromedaResponse
+chain = AndromedaChain("http://0.0.0.0:9000/guidance_api/v1/generate")
+
+prompt = AndromedaPrompt(
+    name="hello",
+    prompt_template="Howdy: {{gen 'expert_names' temperature=0 max_tokens=300}}",
+    input_vars=[],
+    output_vars=["expert_names"]
+)
+
+response: AndromedaResponse = chain.run_guidance_prompt(prompt)
+```
+
+"""
 import json
 from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
 from threading import Thread
@@ -5,19 +29,38 @@
 import guidance
 from modules import shared
 
-class Handler(BaseHTTPRequestHandler):
-    def do_GET(self):
-        if self.path == '/api/v1/model':
-            self.send_response(200)
-            self.end_headers()
-            response = json.dumps({
-                'result': shared.model_name
-            })
+
+# This extension depends on having the model already fully loaded, including LoRA
+
+guidance_model = None
+
+def load_guidance_model_singleton():
+    global guidance_model
+
+    if guidance_model:
+        return guidance_model
+    try:
+        import guidance
+    except ImportError:
+        raise ImportError("Please run 'pip install guidance' before using the guidance extension.")
+
+    if not shared.model or not shared.tokenizer:
+        raise ValueError("Cannot use guidance extension without a pre-initialized model!")
+
+    # For now only supports HF Transformers
+    # As far as I know, this includes:
+    # - 8 and 4 bits quantizations loaded through bitsandbytes
+    # - GPTQ variants
+    # - Models with LoRAs
+
+    guidance_model = guidance.llms.Transformers(
+        model=shared.model,
+        tokenizer=shared.tokenizer,
+        device=shared.args.guidance_device
+    )
+    guidance.llm = guidance_model
 
-            self.wfile.write(response.encode('utf-8'))
-        else:
-            self.send_error(404)
-
+class Handler(BaseHTTPRequestHandler):
     def do_POST(self):
         content_length = int(self.headers['Content-Length'])
         body = json.loads(self.rfile.read(content_length).decode('utf-8'))
@@ -31,6 +74,7 @@ def do_POST(self):
             kwargs = body["guidance_kwargs"]
             output_vars = body["output_vars"]
 
+            llm = load_guidance_model_singleton()
             guidance_program = guidance(prompt_template)
             program_result = guidance_program(
                 **kwargs,
@@ -38,13 +82,17 @@ def do_POST(self):
                 async_mode=False,
                 caching=False,
                 **input_vars,
-                llm=shared.guidance_model,
+                llm=llm,
             )
             output = {"__main__": str(program_result)}
             for output_var in output_vars:
                 output[output_var] = program_result[output_var]
 
+
             response = json.dumps(output)
+            self.send_response(200)
+            self.send_header('Content-Type', 'application/json')
+            self.end_headers()
             self.wfile.write(response.encode('utf-8'))
 
 
@@ -52,13 +100,10 @@ def _run_server(port: int):
     address = '0.0.0.0' if shared.args.listen else '127.0.0.1'
 
     server = ThreadingHTTPServer((address, port), Handler)
-    print(f'Starting API at http://{address}:{port}/api')
+    print(f'Starting Guidance API at http://{address}:{port}/guidance_api')
 
     server.serve_forever()
 
 
 def start_server(port: int):
-    if not shared.guidance_model:
-        raise ValueError("Guidance model was not properly initialized. Cannot start guidance extension.")
-
     Thread(target=_run_server, args=[port], daemon=True).start()
diff --git a/modules/shared.py b/modules/shared.py
index d99777932a..39998ab979 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -178,6 +178,8 @@ def str2bool(v):
 # Guidance Server
 parser.add_argument('--guidance', action='store_true', help='Enable the guidance API extension.')
 parser.add_argument('--guidance-port', type=int, default=9000, help='The listening port for the blocking guidance API.')
+parser.add_argument('--guidance-device', type=str, default='cuda', help='The listening port for the blocking guidance API.')
+
 
 # Multimodal
 parser.add_argument('--multimodal-pipeline', type=str, default=None, help='The multimodal pipeline to use. Examples: llava-7b, llava-13b.')
@@ -212,7 +214,7 @@ def add_extension(name):
     add_extension('multimodal')
 
 if args.guidance:
-    add_extension("guidance_server")
+    add_extension("guidance")
 
 def is_chat():
     return args.chat
@@ -236,6 +238,3 @@ def is_chat():
             model_config[k] = user_config[k]
 
 model_config = OrderedDict(model_config)
-
-
-def use_guidance():
-    return args.guidance
diff --git a/requirements.txt b/requirements.txt
index 13a6ba8a02..a14d42a1e2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,5 +22,4 @@ https://github.com/jllllll/bitsandbytes-windows-webui/raw/main/bitsandbytes-0.39
 llama-cpp-python==0.1.57; platform_system != "Windows"
 https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.57/llama_cpp_python-0.1.57-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.0/auto_gptq-0.2.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
-https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.0/auto_gptq-0.2.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux"
-guidance
\ No newline at end of file
+https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.0/auto_gptq-0.2.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux"
\ No newline at end of file
diff --git a/server.py b/server.py
index 989ab30c78..fe2bc4a5a0 100644
--- a/server.py
+++ b/server.py
@@ -1088,21 +1088,6 @@ def create_interface():
             'instruction_template': shared.settings['instruction_template']
         })
-    # This extension depends on having the model already fully loaded, including LoRA
-    if shared.use_guidance():
-        try:
-            import guidance
-        except ImportError:
-            raise ImportError("Please run 'pip install guidance' before using the guidance extension.")
-
-        # For now only supports HF Transformers
-        # As far as I know, this includes GPTQ variants and LoRAs
-        shared.guidance_model = guidance.llms.Transformers(
-            model=shared.model,
-            tokenizer=shared.tokenizer,
-            device="auto"
-        )
-
 
     shared.generation_lock = Lock()
 
     # Launch the web UI

From f6f3d943514507475a58959598bb3f48485743fa Mon Sep 17 00:00:00 2001
From: Paolo Rechia
Date: Sun, 11 Jun 2023 16:31:37 +0200
Subject: [PATCH 3/3] Fix description

---
 modules/shared.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/shared.py b/modules/shared.py
index 39998ab979..10b69d41f3 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -177,8 +177,8 @@ def str2bool(v):
 
 # Guidance Server
 parser.add_argument('--guidance', action='store_true', help='Enable the guidance API extension.')
-parser.add_argument('--guidance-port', type=int, default=9000, help='The listening port for the blocking guidance API.')
-parser.add_argument('--guidance-device', type=str, default='cuda', help='The listening port for the blocking guidance API.')
+parser.add_argument('--guidance-port', type=int, default=9000, help='The listening port for the guidance API.')
+parser.add_argument('--guidance-device', type=str, default='cuda', help='The device where the model is loaded on.')
 
 
 # Multimodal
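
Note (not part of the patch series): the docstring added in PATCH 2/3 shows the AndromedaChain client wrapper; the endpoint can also be exercised directly over HTTP. The snippet below is a minimal client-side sketch, assuming the `requests` package is installed, the server was started with `--guidance` on the default port 9000 without `--listen`, and using the payload fields that `Handler.do_POST` reads (`prompt_template`, `input_vars`, `guidance_kwargs`, `output_vars`). The prompt string is borrowed from the docstring example.

```python
# Hypothetical usage sketch: POST directly to the guidance extension's endpoint
# using the fields that Handler.do_POST reads from the request body.
import requests  # assumed to be available in the client environment

payload = {
    # guidance template, taken from the docstring example in PATCH 2/3
    "prompt_template": "Howdy: {{gen 'expert_names' temperature=0 max_tokens=300}}",
    "input_vars": {},        # template variables, unpacked into the guidance program
    "guidance_kwargs": {},   # extra keyword arguments forwarded to the program call
    "output_vars": ["expert_names"],  # variables to read back from the program result
}

resp = requests.post("http://127.0.0.1:9000/guidance_api/v1/generate", json=payload)
result = resp.json()
print(result["__main__"])      # the full rendered program output
print(result["expert_names"])  # one of the requested output variables
```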