From bc8b94217415a0dff07f7b18a88ac46e7ac0bf4a Mon Sep 17 00:00:00 2001
From: Paolo Rechia
Date: Tue, 6 Jun 2023 22:52:38 +0200
Subject: [PATCH 1/3] Guidance extension

---
 extensions/guidance/guidance_server.py | 64 ++++++++++++++++++++++++++
 extensions/guidance/requirements.txt   |  1 +
 extensions/guidance/script.py          |  5 ++
 modules/shared.py                      | 12 ++++-
 requirements.txt                       |  1 +
 server.py                              | 16 +++++++
 6 files changed, 98 insertions(+), 1 deletion(-)
 create mode 100644 extensions/guidance/guidance_server.py
 create mode 100644 extensions/guidance/requirements.txt
 create mode 100644 extensions/guidance/script.py

diff --git a/extensions/guidance/guidance_server.py b/extensions/guidance/guidance_server.py
new file mode 100644
index 0000000000..68ea13b44b
--- /dev/null
+++ b/extensions/guidance/guidance_server.py
@@ -0,0 +1,64 @@
+import json
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+from threading import Thread
+
+import guidance
+from modules import shared
+
+class Handler(BaseHTTPRequestHandler):
+    def do_GET(self):
+        if self.path == '/api/v1/model':
+            self.send_response(200)
+            self.end_headers()
+            response = json.dumps({
+                'result': shared.model_name
+            })
+
+            self.wfile.write(response.encode('utf-8'))
+        else:
+            self.send_error(404)
+
+    def do_POST(self):
+        content_length = int(self.headers['Content-Length'])
+        body = json.loads(self.rfile.read(content_length).decode('utf-8'))
+
+        if self.path == '/guidance_api/v1/generate':
+            # TODO: add request validation
+            # For now disabled to avoid an extra dependency on validation libraries, like Pydantic
+
+            prompt_template = body["prompt_template"]
+            input_vars = body["input_vars"]
+            kwargs = body["guidance_kwargs"]
+            output_vars = body["output_vars"]
+
+            guidance_program = guidance(prompt_template)
+            program_result = guidance_program(
+                **kwargs,
+                stream=False,
+                async_mode=False,
+                caching=False,
+                **input_vars,
+                llm=shared.guidance_model,
+            )
+            output = {"__main__": str(program_result)}
+            for output_var in output_vars:
+                output[output_var] = program_result[output_var]
+
+            response = json.dumps(output)
+            self.wfile.write(response.encode('utf-8'))
+
+
+def _run_server(port: int):
+    address = '0.0.0.0' if shared.args.listen else '127.0.0.1'
+
+    server = ThreadingHTTPServer((address, port), Handler)
+    print(f'Starting API at http://{address}:{port}/api')
+
+    server.serve_forever()
+
+
+def start_server(port: int):
+    if not shared.guidance_model:
+        raise ValueError("Guidance model was not properly initialized. Cannot start guidance extension.")
+
+    Thread(target=_run_server, args=[port], daemon=True).start()
diff --git a/extensions/guidance/requirements.txt b/extensions/guidance/requirements.txt
new file mode 100644
index 0000000000..07d0c54f45
--- /dev/null
+++ b/extensions/guidance/requirements.txt
@@ -0,0 +1 @@
+guidance
\ No newline at end of file
diff --git a/extensions/guidance/script.py b/extensions/guidance/script.py
new file mode 100644
index 0000000000..404d01da4b
--- /dev/null
+++ b/extensions/guidance/script.py
@@ -0,0 +1,5 @@
+from extensions.guidance import guidance_server
+from modules import shared
+
+def setup():
+    guidance_server.start_server(shared.args.guidance_port)
diff --git a/modules/shared.py b/modules/shared.py
index 9f4f720c68..d99777932a 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -11,6 +11,7 @@
 tokenizer = None
 model_name = "None"
 model_type = None
+guidance_model = None
 lora_names = []
 
 # Chat variables
@@ -174,6 +175,10 @@ def str2bool(v):
 parser.add_argument('--api-streaming-port', type=int, default=5005, help='The listening port for the streaming API.')
 parser.add_argument('--public-api', action='store_true', help='Create a public URL for the API using Cloudfare.')
 
+# Guidance Server
+parser.add_argument('--guidance', action='store_true', help='Enable the guidance API extension.')
+parser.add_argument('--guidance-port', type=int, default=9000, help='The listening port for the blocking guidance API.')
+
 # Multimodal
 parser.add_argument('--multimodal-pipeline', type=str, default=None, help='The multimodal pipeline to use. Examples: llava-7b, llava-13b.')
 
@@ -206,11 +211,12 @@ def add_extension(name):
 if args.multimodal_pipeline is not None:
     add_extension('multimodal')
 
+if args.guidance:
+    add_extension("guidance_server")
 
 def is_chat():
     return args.chat
 
-
 # Loading model-specific settings
 with Path(f'{args.model_dir}/config.yaml') as p:
     if p.exists():
@@ -229,3 +235,7 @@ def is_chat():
             model_config[k] = user_config[k]
 
 model_config = OrderedDict(model_config)
+
+
+def use_guidance():
+    return args.guidance
diff --git a/requirements.txt b/requirements.txt
index 9fdab32856..13a6ba8a02 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -23,3 +23,4 @@ llama-cpp-python==0.1.57; platform_system != "Windows"
 https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.57/llama_cpp_python-0.1.57-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.0/auto_gptq-0.2.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.0/auto_gptq-0.2.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux"
+guidance
\ No newline at end of file
diff --git a/server.py b/server.py
index 5e770d1b45..989ab30c78 100644
--- a/server.py
+++ b/server.py
@@ -1088,6 +1088,22 @@ def create_interface():
             'instruction_template': shared.settings['instruction_template']
         })
+    # This extension depends on having the model already fully loaded, including LoRA
+    if shared.use_guidance():
+        try:
+            import guidance
+        except ImportError:
+            raise ImportError("Please run 'pip install guidance' before using the guidance extension.")
+
+        # For now only supports HF Transformers
+        # As far as I know, this includes GPTQ variants and LoRAs
+        shared.guidance_model = guidance.llms.Transformers(
+            model=shared.model,
+            tokenizer=shared.tokenizer,
+            device="auto"
+        )
+
+
     shared.generation_lock = Lock()
 
     # Launch the web UI
     create_interface()

From c8b830629d5acb05de24705e11493dbf11e36349 Mon Sep 17 00:00:00 2001
From: Paolo Rechia
Date: Tue, 6 Jun 2023 23:51:16 +0200
Subject: [PATCH 2/3] First working version of extension

---
 extensions/guidance/guidance_server.py | 77 ++++++++++++++++++++------
 modules/shared.py                      |  7 +--
 requirements.txt                       |  3 +-
 server.py                              | 15 -----
 4 files changed, 65 insertions(+), 37 deletions(-)

diff --git a/extensions/guidance/guidance_server.py b/extensions/guidance/guidance_server.py
index 68ea13b44b..0f10082fd4 100644
--- a/extensions/guidance/guidance_server.py
+++ b/extensions/guidance/guidance_server.py
@@ -1,3 +1,27 @@
+"""Loads model into the guidance library (https://github.com/microsoft/guidance).
+It aims to reduce the entry barrier of using the guidance library with quantized models
+
+The easiest way to get started with this extension is by using the Python client wrapper:
+
+https://github.com/ChuloAI/andromeda-chain
+
+Example:
+
+```
+from andromeda_chain import AndromedaChain, AndromedaPrompt, AndromedaResponse
+chain = AndromedaChain("http://0.0.0.0:9000/guidance_api/v1/generate")
+
+prompt = AndromedaPrompt(
+    name="hello",
+    prompt_template="Howdy: {{gen 'expert_names' temperature=0 max_tokens=300}}",
+    input_vars=[],
+    output_vars=["expert_names"]
+)
+
+response: AndromedaResponse = chain.run_guidance_prompt(prompt)
+```
+
+"""
 import json
 from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
 from threading import Thread
@@ -5,19 +29,38 @@
 import guidance
 from modules import shared
 
-class Handler(BaseHTTPRequestHandler):
-    def do_GET(self):
-        if self.path == '/api/v1/model':
-            self.send_response(200)
-            self.end_headers()
-            response = json.dumps({
-                'result': shared.model_name
-            })
+
+# This extension depends on having the model already fully loaded, including LoRA
+
+guidance_model = None
+
+def load_guidance_model_singleton():
+    global guidance_model
+
+    if guidance_model:
+        return guidance_model
+    try:
+        import guidance
+    except ImportError:
+        raise ImportError("Please run 'pip install guidance' before using the guidance extension.")
+
+    if not shared.model or not shared.tokenizer:
+        raise ValueError("Cannot use guidance extension without a pre-initialized model!")
+
+    # For now only supports HF Transformers
+    # As far as I know, this includes:
+    # - 8 and 4 bits quantizations loaded through bitsandbytes
+    # - GPTQ variants
+    # - Models with LoRAs
+
+    guidance_model = guidance.llms.Transformers(
+        model=shared.model,
+        tokenizer=shared.tokenizer,
+        device=shared.args.guidance_device
+    )
+    guidance.llm = guidance_model
 
-            self.wfile.write(response.encode('utf-8'))
-        else:
-            self.send_error(404)
-
+class Handler(BaseHTTPRequestHandler):
     def do_POST(self):
         content_length = int(self.headers['Content-Length'])
         body = json.loads(self.rfile.read(content_length).decode('utf-8'))
@@ -31,6 +74,7 @@ def do_POST(self):
             kwargs = body["guidance_kwargs"]
             output_vars = body["output_vars"]
 
+            llm = load_guidance_model_singleton()
             guidance_program = guidance(prompt_template)
             program_result = guidance_program(
                 **kwargs,
@@ -38,13 +82,17 @@ def do_POST(self):
                 async_mode=False,
                 caching=False,
                 **input_vars,
-                llm=shared.guidance_model,
+                llm=llm,
             )
             output = {"__main__": str(program_result)}
             for output_var in output_vars:
                 output[output_var] = program_result[output_var]
 
+
             response = json.dumps(output)
+            self.send_response(200)
+            self.send_header('Content-Type', 'application/json')
+            self.end_headers()
             self.wfile.write(response.encode('utf-8'))
 
 
@@ -52,13 +100,10 @@ def _run_server(port: int):
     address = '0.0.0.0' if shared.args.listen else '127.0.0.1'
 
     server = ThreadingHTTPServer((address, port), Handler)
-    print(f'Starting API at http://{address}:{port}/api')
+    print(f'Starting Guidance API at http://{address}:{port}/guidance_api')
 
     server.serve_forever()
 
 
 def start_server(port: int):
-    if not shared.guidance_model:
-        raise ValueError("Guidance model was not properly initialized. Cannot start guidance extension.")
-
     Thread(target=_run_server, args=[port], daemon=True).start()
diff --git a/modules/shared.py b/modules/shared.py
index d99777932a..39998ab979 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -178,6 +178,8 @@ def str2bool(v):
 # Guidance Server
 parser.add_argument('--guidance', action='store_true', help='Enable the guidance API extension.')
 parser.add_argument('--guidance-port', type=int, default=9000, help='The listening port for the blocking guidance API.')
+parser.add_argument('--guidance-device', type=str, default='cuda', help='The listening port for the blocking guidance API.')
+
 
 # Multimodal
 parser.add_argument('--multimodal-pipeline', type=str, default=None, help='The multimodal pipeline to use. Examples: llava-7b, llava-13b.')
@@ -212,7 +214,7 @@ def add_extension(name):
     add_extension('multimodal')
 
 if args.guidance:
-    add_extension("guidance_server")
+    add_extension("guidance")
 
 def is_chat():
     return args.chat
@@ -236,6 +238,3 @@ def is_chat():
             model_config[k] = user_config[k]
 
 model_config = OrderedDict(model_config)
-
-
-def use_guidance():
-    return args.guidance
diff --git a/requirements.txt b/requirements.txt
index 13a6ba8a02..a14d42a1e2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,5 +22,4 @@ https://github.com/jllllll/bitsandbytes-windows-webui/raw/main/bitsandbytes-0.39
 llama-cpp-python==0.1.57; platform_system != "Windows"
 https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.57/llama_cpp_python-0.1.57-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.0/auto_gptq-0.2.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
-https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.0/auto_gptq-0.2.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux"
-guidance
\ No newline at end of file
+https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.0/auto_gptq-0.2.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux"
\ No newline at end of file
diff --git a/server.py b/server.py
index 989ab30c78..fe2bc4a5a0 100644
--- a/server.py
+++ b/server.py
@@ -1088,21 +1088,6 @@ def create_interface():
             'instruction_template': shared.settings['instruction_template']
         })
-    # This extension depends on having the model already fully loaded, including LoRA
-    if shared.use_guidance():
-        try:
-            import guidance
-        except ImportError:
-            raise ImportError("Please run 'pip install guidance' before using the guidance extension.")
-
-        # For now only supports HF Transformers
-        # As far as I know, this includes GPTQ variants and LoRAs
-        shared.guidance_model = guidance.llms.Transformers(
-            model=shared.model,
-            tokenizer=shared.tokenizer,
-            device="auto"
-        )
-
 
     shared.generation_lock = Lock()
 
     # Launch the web UI

From f6f3d943514507475a58959598bb3f48485743fa Mon Sep 17 00:00:00 2001
From: Paolo Rechia
Date: Sun, 11 Jun 2023 16:31:37 +0200
Subject: [PATCH 3/3] Fix description

---
 modules/shared.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/shared.py b/modules/shared.py
index 39998ab979..10b69d41f3 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -177,8 +177,8 @@ def str2bool(v):
 
 # Guidance Server
 parser.add_argument('--guidance', action='store_true', help='Enable the guidance API extension.')
-parser.add_argument('--guidance-port', type=int, default=9000, help='The listening port for the blocking guidance API.')
-parser.add_argument('--guidance-device', type=str, default='cuda', help='The listening port for the blocking guidance API.')
+parser.add_argument('--guidance-port', type=int, default=9000, help='The listening port for the guidance API.')
+parser.add_argument('--guidance-device', type=str, default='cuda', help='The device where the model is loaded on.')
 
 
 # Multimodal
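
Note (not part of the patch series): the docstring added in PATCH 2/3 shows the AndromedaChain client wrapper; the endpoint can also be exercised directly over HTTP. The snippet below is a minimal client-side sketch, assuming the `requests` package is installed, the server was started with `--guidance` on the default port 9000 without `--listen`, and using the payload fields that `Handler.do_POST` reads (`prompt_template`, `input_vars`, `guidance_kwargs`, `output_vars`). The prompt string is borrowed from the docstring example.

```python
# Hypothetical usage sketch: POST directly to the guidance extension's endpoint
# using the fields that Handler.do_POST reads from the request body.
import requests  # assumed to be available in the client environment

payload = {
    # guidance template, taken from the docstring example in PATCH 2/3
    "prompt_template": "Howdy: {{gen 'expert_names' temperature=0 max_tokens=300}}",
    "input_vars": {},        # template variables, unpacked into the guidance program
    "guidance_kwargs": {},   # extra keyword arguments forwarded to the program call
    "output_vars": ["expert_names"],  # variables to read back from the program result
}

resp = requests.post("http://127.0.0.1:9000/guidance_api/v1/generate", json=payload)
result = resp.json()
print(result["__main__"])      # the full rendered program output
print(result["expert_names"])  # one of the requested output variables
```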