diff --git a/SUPPORT.md b/SUPPORT.md deleted file mode 100644 index 7e722035..00000000 --- a/SUPPORT.md +++ /dev/null @@ -1,16 +0,0 @@ -# Support - -## How to file issues and get help - -This project uses GitHub Issues to track bugs and feature requests. Please search the existing -issues before filing new issues to avoid duplicates. For new issues, file your bug or -feature request as a new Issue. - -You may use [GitHub Issues](https://github.com/microsoft/UFO/issues) to raise questions, bug reports, and feature requests. - -For help and questions about using this project, please please contact [ufo-agent@microsoft.com](mailto:ufo-agent@microsoft.com). - - -## Microsoft Support Policy - -Support for this **PROJECT or PRODUCT** is limited to the resources listed above. diff --git a/instantiation/.gitignore b/instantiation/.gitignore new file mode 100644 index 00000000..9da01687 --- /dev/null +++ b/instantiation/.gitignore @@ -0,0 +1,9 @@ +# Ignore files +cache/ +controls_cache/ +tasks/* +!tasks/prefill +templates/word/* +logs/* +controller/utils/ +config/config.yaml diff --git a/instantiation/README.md b/instantiation/README.md new file mode 100644 index 00000000..54ceb6d0 --- /dev/null +++ b/instantiation/README.md @@ -0,0 +1,219 @@ +## Introduction of Instantiation + +**The instantiation process aims to filter and modify instructions according to the current environment.** + +By using this process, we can obtain clearer and more specific instructions, making them more suitable for the execution of the UFO. + +## How to Use + +### 1. Install Packages + +You should install the necessary packages in the UFO root folder: + +```bash +pip install -r requirements.txt +``` + +### 2. Configure the LLMs + +Before using the instantiation section, you need to provide your LLM configurations in `config.yaml` and `config_dev.yaml` located in the `instantiation/config` folder. + +- `config_dev.yaml` specifies the paths of relevant files and contains default settings. 
The match strategy for the control filter supports options: `'contains'`, `'fuzzy'`, and `'regex'`, allowing flexible matching between application windows and target files. + +- `config.yaml` stores the agent information. You should copy the `config.yaml.template` file and fill it out according to the provided hints. + +You will configure the prefill agent and the filter agent individually. The prefill agent is used to prepare the task, while the filter agent evaluates the quality of the prefilled task. You can choose different LLMs for each. + +**BE CAREFUL!** If you are using GitHub or other open-source tools, do not expose your `config.yaml` online, as it contains your private keys. + +Once you have filled out the template, rename it to `config.yaml` to complete the LLM configuration. + +### 3. Prepare Files + +Certain files need to be prepared before running the task. + +#### 3.1. Tasks as JSON + +The tasks that need to be instantiated should be organized in a folder of JSON files, with the default folder path set to `instantiation/tasks`. This path can be changed in the `instantiation/config/config.yaml` file, or you can specify it in the terminal, as mentioned in **4. Start Running**. For example, a task stored in `instantiation/tasks/prefill/` may look like this: + +```json +{ + // The app you want to use + "app": "word", + // A unique ID to distinguish different tasks + "unique_id": "1", + // The task and steps to be instantiated + "task": "Type 'hello' and set the font type to Arial", + "refined_steps": [ + "Type 'hello'", + "Set the font to Arial" + ] +} +``` + +#### 3.2. Templates and Descriptions + +You should place an app file as a reference for instantiation in a folder named after the app. + +For example, if you have `template1.docx` for Word, it should be located at `instantiation/templates/word/template1.docx`. 
+ +Additionally, for each app folder, there should be a `description.json` file located at `instantiation/templates/word/description.json`, which describes each template file in detail. It may look like this: + +```json +{ + "template1.docx": "A document with a rectangle shape", + "template2.docx": "A document with a line of text", + "template3.docx": "A document with a chart" +} +``` + +If a `description.json` file is not present, one template file will be selected at random. + +#### 3.3. Final Structure + +Ensure the following files are in place: + +- [X] JSON files to be instantiated +- [X] Templates as references for instantiation +- [X] Description file in JSON format + +The structure of the files can be: + +```bash +instantiation/ +| +├── tasks/ +│ ├── action_prefill/ +│ │ ├── task1.json +│ │ ├── task2.json +│ │ └── task3.json +│ └── ... +| +├── templates/ +│ ├── word/ +│ │ ├── template1.docx +│ │ ├── template2.docx +│ │ ├── template3.docx +│ │ └── description.json +│ └── ... +└── ... +``` + +### 4. Start Running + +Run the `instantiation/action_prefill.py` file in module mode. You can do this by typing the following command in the terminal: + +```bash +python -m instantiation +``` + +You can use `--task` to specify the task folder you want to use; the default is `action_prefill`: + +```bash +python -m instantiation --task your_task_folder_name +``` + +After the process is completed, a new folder named `prefill_instantiated` will be created alongside the original one. 
This folder will contain the instantiated task, which will look like: + +```json +{ + // A unique ID to distinguish different tasks + "unique_id": "1", + // The chosen template path + "instantial_template_path": "copied template file path", + // The instantiated task and steps + "instantiated_request": "Type 'hello' and set the font type to Arial in the Word document.", + "instantiated_plan": [ + { + "step 1": "Select the target text 'text to edit'", + "controlLabel": "", + "controlText": "", + "function": "select_text", + "args": { + "text": "text to edit" + } + }, + { + "step 2": "Type 'hello'", + "controlLabel": "101", + "controlText": "Edit", + "function": "type_keys", + "args": { + "text": "hello" + } + }, + { + "step 3": "Select the typed text 'hello'", + "controlLabel": "", + "controlText": "", + "function": "select_text", + "args": { + "text": "hello" + } + }, + { + "step 4": "Click the font dropdown", + "controlLabel": "", + "controlText": "Consolas", + "function": "click_input", + "args": { + "button": "left", + "double": false + } + }, + { + "step 5": "Set the font to Arial", + "controlLabel": "", + "controlText": "Arial", + "function": "click_input", + "args": { + "button": "left", + "double": false + } + } + ], + "result": { + "filter": "Drawing or writing a signature using the drawing tools in the Word desktop app is a task that can be executed locally within the application." + }, + "execution_time": { + "choose_template": 10.650701761245728, + "prefill": 44.23913502693176, + "filter": 3.746831178665161, + "total": 58.63666796684265 + } +} +``` + +Additionally, a `prefill_templates` folder will be created, which stores the copied chosen templates for each task. + +## Workflow + +There are three key steps in the instantiation process: + +1. Choose a template file according to the specified app and instruction. +2. Prefill the task using the current screenshot. +3. Filter the established task. + +#### 1. 
Choose Template File + +Templates for your app must be defined and described in `instantiation/templates/app`. For instance, if you want to instantiate tasks for the Word application, place the relevant `.docx` files in `instantiation/templates/word`, along with a `description.json` file. + +The appropriate template will be selected based on how well its description matches the instruction. + +#### 2. Prefill the Task + +After selecting the template file, it will be opened, and a screenshot will be taken. If the template file is currently in use, errors may occur. + +The screenshot will be sent to the action prefill agent, which will return a modified task. + +#### 3. Filter Task + +The completed task will be evaluated by a filter agent, which will assess it and provide feedback. If the task is deemed a good instance, it will be saved in `instantiation/tasks/your_folder_name_instantiated/instances_pass/`; otherwise, it will be saved in `instantiation/tasks/your_folder_name_instantiated/instances_fail/`. + +All encountered error messages and tracebacks are saved in `instantiation/tasks/your_folder_name_instantiated/instances_error/`. + +## Notes + +1. Users should be careful to save the original files while using this project; otherwise, the files will be closed when the app is shut down. + +2. After starting the project, users should not close the app window while the program is taking screenshots. diff --git a/instantiation/__main__.py b/instantiation/__main__.py new file mode 100644 index 00000000..b0f9849d --- /dev/null +++ b/instantiation/__main__.py @@ -0,0 +1,7 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
class Config(Config):
    """
    Configuration holder for the instantiation module.
    It keeps its own `_instance` slot, separate from the base class it extends.
    """

    _instance = None

    def __init__(self, config_path="instantiation/config/"):
        """
        Load the configuration data from the given folder.
        :param config_path: The path to the config file.
        """
        self.config_data = self.load_config(config_path)

    @staticmethod
    def get_instance():
        """
        Return the shared Config object, creating it on first use.
        :return: The instance of the Config class.
        """
        instance = Config._instance
        if instance is None:
            # First call: build the object and remember it for later callers.
            instance = Config()
            Config._instance = instance

        return instance

    def optimize_configs(self, configs):
        """
        Update the API base of every agent section used by the instantiation.
        :param configs: The configurations to optimize.
        :return: The optimized configurations.
        """
        for agent_key in ("PREFILL_AGENT", "FILTER_AGENT"):
            self.update_api_base(configs, agent_key)

        return configs
As for the AAD, it should be your endpoints. + API_KEY: "YOUR_API_KEY", # The OpenAI API key + API_VERSION: "2024-02-15-preview", # "2024-02-15-preview" by default + API_MODEL: "gpt-4o-20240513", # The only OpenAI model by now that accepts visual input + + ###For the AOAI + API_DEPLOYMENT_ID: "gpt-4-0125-preview", # The deployment id for the AOAI API + ### For Azure_AD + AAD_TENANT_ID: "YOUR_AAD_ID", # Set the value to your tenant id for the llm model + AAD_API_SCOPE: "openai", # Set the value to your scope for the llm model + AAD_API_SCOPE_BASE: "YOUR_AAD_API_SCOPE_BASE" # Set the value to your scope base for the llm model, whose format is API://YOUR_SCOPE_BASE; only the YOUR_SCOPE_BASE part is needed +} + +FILTER_AGENT: { + VISUAL_MODE: False, # Whether to use the visual mode + + API_TYPE: "azure_ad" , # The API type, "openai" for the OpenAI API, "aoai" for the Azure OpenAI. + API_BASE: "https://cloudgpt-openai.azure-api.net/", # The OpenAI API endpoint, "https://api.openai.com/v1/chat/completions" for the OpenAI API.
As for the aoai, it should be https://{your-resource-name}.openai.azure.com + API_KEY: "YOUR_API_KEY", # The aoai API key + API_VERSION: "2024-04-01-preview", # "2024-02-15-preview" by default + API_MODEL: "gpt-4o-20240513", # The only OpenAI model by now that accepts visual input + API_DEPLOYMENT_ID: "gpt-4o-20240513-preview", # The deployment id for the AOAI API + + ### For Azure_AD + AAD_TENANT_ID: "YOUR_AAD_ID", + AAD_API_SCOPE: "openai", #"openai" + AAD_API_SCOPE_BASE: "YOUR_AAD_API_SCOPE_BASE", #API://YOUR_SCOPE_BASE +} + +# For parameters +MAX_TOKENS: 2000 # The max token limit for the response completion +MAX_RETRY: 3 # The max retry limit for the response completion +TEMPERATURE: 0.0 # The temperature of the model: the lower the value, the more consistent the output of the model +TOP_P: 0.0 # The top_p of the model: the lower the value, the more conservative the output of the model +TIMEOUT: 60 # The call timeout in seconds, default is 10 mins \ No newline at end of file diff --git a/instantiation/config/config_dev.yaml b/instantiation/config/config_dev.yaml new file mode 100644 index 00000000..7b5509e2 --- /dev/null +++ b/instantiation/config/config_dev.yaml @@ -0,0 +1,31 @@ +version: 0.1 + +AOAI_DEPLOYMENT: "gpt-4-visual-preview" # Your AOAI deployment, if applicable +API_VERSION: "2024-02-15-preview" # "2024-02-15-preview" by default.
+OPENAI_API_MODEL: "gpt-4-0125-preview" # The only OpenAI model by now that accepts visual input + +CONTROL_BACKEND: "uia" # The backend for control action +CONTROL_LIST: ["Button", "Edit", "TabItem", "Document", "ListItem", "MenuItem", "ScrollBar", "TreeItem", "Hyperlink", "ComboBox", "RadioButton", "DataItem", "Spinner"] +PRINT_LOG: False # Whether to print the log +LOG_LEVEL: "INFO" # The log level +MATCH_STRATEGY: "regex" # The match strategy for the control filter, support 'contains', 'fuzzy', 'regex' + +PREFILL_PROMPT: "instantiation/controller/prompts/{mode}/prefill.yaml" # The prompt for the action prefill +FILTER_PROMPT: "instantiation/controller/prompts/{mode}/filter.yaml" # The prompt for the filter +PREFILL_EXAMPLE_PROMPT: "instantiation/controller/prompts/{mode}/prefill_example.yaml" # The prompt for the action prefill example +API_PROMPT: "ufo/prompts/share/lite/api.yaml" # The prompt for the API + +# Exploration Configuration +TASKS_HUB: "instantiation/tasks" # The tasks hub for the exploration +TEMPLATE_PATH: "instantiation/templates" # The template path for the exploration + +# For control filtering +CONTROL_FILTER_TYPE: [] # The list of control filter type, support 'TEXT', 'SEMANTIC', 'ICON' +CONTROL_FILTER_MODEL_SEMANTIC_NAME: "all-MiniLM-L6-v2" # The control filter model name of semantic similarity +CONTROL_EMBEDDING_CACHE_PATH: "instantiation/cache/" # The cache path for the control filter +CONTROL_FILTER_TOP_K_PLAN: 2 # The control filter effect on top k plans from UFO, default is 2 + +# log path +LOG_PATH: "instantiation/logs/{task}" +PREFILL_LOG_PATH: "instantiation/logs/{task}/prefill/" +FILTER_LOG_PATH: "instantiation/logs/{task}/filter/" \ No newline at end of file diff --git a/instantiation/controller/agent/agent.py b/instantiation/controller/agent/agent.py new file mode 100644 index 00000000..ecfabeb3 --- /dev/null +++ b/instantiation/controller/agent/agent.py @@ -0,0 +1,166 @@ +# Copyright (c) Microsoft Corporation. 
class FilterAgent(BasicAgent):
    """
    The Agent to evaluate whether the instantiated task is correct or not.
    """

    def __init__(
        self,
        name: str,
        process_name: str,
        is_visual: bool,
        main_prompt: str,
        example_prompt: str,
        api_prompt: str,
    ):
        """
        Initialize the FilterAgent.
        :param name: The name of the agent.
        :param process_name: The name of the process.
        :param is_visual: The flag indicating whether the agent is visual or not.
        :param main_prompt: The main prompt.
        :param example_prompt: The example prompt.
        :param api_prompt: The API prompt.
        """

        # NOTE(review): BasicAgent.__init__ is not invoked here; the agent state
        # is set directly on the instance instead. Confirm this is intentional.
        self._step = 0
        self._complete = False
        self._name = name
        self._status = None
        self.prompter: FilterPrompter = self.get_prompter(
            is_visual, main_prompt, example_prompt, api_prompt
        )
        self._process_name = process_name

    def get_prompter(
        self, is_visual: bool, main_prompt: str, example_prompt: str, api_prompt: str
    ) -> FilterPrompter:
        """
        Build the prompter used by the agent.
        This is the abstract method from BasicAgent that needs to be implemented.
        :param is_visual: The flag indicating whether the agent is visual or not.
        :param main_prompt: The main prompt.
        :param example_prompt: The example prompt.
        :param api_prompt: The API prompt.
        :return: The FilterPrompter instance (a prompter object, not a prompt string).
        """

        return FilterPrompter(is_visual, main_prompt, example_prompt, api_prompt)

    def message_constructor(self, request: str, app: str) -> List[str]:
        """
        Construct the prompt message for the FilterAgent.
        :param request: The request sentence.
        :param app: The name of the operated app.
        :return: The prompt message.
        """
        # The system part is parameterized by the app, the user part by the request.
        filter_agent_prompt_system_message = self.prompter.system_prompt_construction(
            app=app
        )
        filter_agent_prompt_user_message = self.prompter.user_content_construction(
            request
        )
        filter_agent_prompt_message = self.prompter.prompt_construction(
            filter_agent_prompt_system_message, filter_agent_prompt_user_message
        )

        return filter_agent_prompt_message

    def process_comfirmation(self) -> None:
        """
        Confirm the process.
        This is the abstract method from BasicAgent that needs to be implemented.
        The FilterAgent needs no confirmation step, so this is a no-op.
        """
        pass
class WindowsAppEnv:
    """
    Represents the Windows Application Environment.
    """

    def __init__(self, app_object: object) -> None:
        """
        Initializes the Windows Application Environment.
        :param app_object: The app object containing information about the application.
        """
        super().__init__()
        self.app_window = None
        self.app_root_name = app_object.app_root_name
        self.app_name = app_object.description.lower()
        self.win_app = app_object.win_app
        # Look up the COM receiver factory from the registry and build the
        # receiver for this application.
        self._receive_factory = ReceiverManager._receiver_factory_registry["COM"][
            "factory"
        ]
        self.win_com_receiver = self._receive_factory.create_receiver(
            self.app_root_name, self.app_name
        )

        self._all_controls = None

    def start(self, copied_template_path: str) -> None:
        """
        Starts the Windows environment.
        :param copied_template_path: The file path to the copied template to start the environment.
        :raises Exception: Re-raises whatever the file controller raised, after logging it.
        """
        from ufo.automator.ui_control import openfile

        file_controller = openfile.FileController(_BACKEND)
        try:
            file_controller.execute_code(
                {"APP": self.win_app, "file_path": copied_template_path}
            )
        except Exception as e:
            logging.exception(f"Failed to start the application: {e}")
            raise

    def close(self) -> None:
        """
        Closes the Windows environment.
        :raises Exception: Re-raises any failure while closing, after logging it.
        """
        try:
            com_object = self.win_com_receiver.get_object_from_process_name()
            com_object.Close()
            self.win_com_receiver.client.Quit()
            # Pause for a second after asking the application to quit.
            time.sleep(1)
        except Exception as e:
            logging.exception(f"Failed to close the application: {e}")
            raise

    def find_matching_window(self, doc_name: str) -> object:
        """
        Finds a matching window based on the process name and the configured matching strategy.
        :param doc_name: The document name associated with the application.
        :return: The matched window or None if no match is found.
        """
        desktop = Desktop(backend=_BACKEND)
        windows_list = desktop.windows()
        for window in windows_list:
            window_title = window.element_info.name.lower()
            if self._match_window_name(window_title, doc_name):
                # Cache all controls for the window
                self._all_controls = window.children()
                return window
        return None

    def _match_window_name(
        self, window_title: str, doc_name: str, strategy: str = None
    ) -> bool:
        """
        Matches the window name based on the strategy specified in the config file.
        :param window_title: The title of the window (expected in lower case).
        :param doc_name: The document name associated with the application.
        :param strategy: Optional override of the match strategy ('contains', 'fuzzy'
            or 'regex'); when None, the configured MATCH_STRATEGY is used.
        :return: True if a match is found based on the strategy; False otherwise.
        :raises ValueError: If the strategy is not one of the supported values.
        """
        if strategy is None:
            strategy = _MATCH_STRATEGY

        app_name = self.app_name
        doc_name = doc_name.lower()

        if strategy == "contains":
            return app_name in window_title and doc_name in window_title

        if strategy == "fuzzy":
            similarity_app = fuzz.partial_ratio(window_title, app_name)
            similarity_doc = fuzz.partial_ratio(window_title, doc_name)
            return similarity_app >= 70 and similarity_doc >= 70

        if strategy == "regex":
            # The names are literal text, not patterns: escape them so that
            # characters such as '.', '(' or '+' in a file name match themselves.
            app_pattern = re.escape(app_name)
            doc_pattern = re.escape(doc_name)
            # Accept the two names in either order, with anything in between.
            pattern = re.compile(
                f"{app_pattern}.*{doc_pattern}|{doc_pattern}.*{app_pattern}",
                flags=re.IGNORECASE,
            )
            return pattern.search(window_title) is not None

        # Not inside an exception handler here, so log a plain error rather
        # than logging.exception (which would append an empty traceback).
        logging.error(f"Unknown match strategy: {strategy}")
        raise ValueError(f"Unknown match strategy: {strategy}")
class AppEnum(Enum):
    """
    Enumerate the apps that can be used in the instantiation.
    Each member value is the tuple (id, description, file extension, windows app name).
    """

    WORD = 1, "Word", ".docx", "winword"
    EXCEL = 2, "Excel", ".xlsx", "excel"
    POWERPOINT = 3, "PowerPoint", ".pptx", "powerpnt"

    def __init__(self, id: int, description: str, file_extension: str, win_app: str):
        """
        Unpack the member's value tuple into named attributes.
        :param id: The unique id of the app.
        :param description: The description of the app.
        :param file_extension: The file extension of the app.
        :param win_app: The windows app name of the app.
        """
        self.id, self.description = id, description
        self.file_extension, self.win_app = file_extension, win_app
        # The root name is the upper-cased windows app name with an .EXE suffix.
        self.app_root_name = f"{win_app.upper()}.EXE"
class InstantiationProcess:
    """
    Key process to instantiate the task.
    Control the overall process.
    """

    def instantiate_files(self, task_dir_name: str) -> None:
        """
        Instantiate all the task files.
        A failure in one task is logged and recorded; the remaining tasks still run.
        :param task_dir_name: The name of the task directory.
        """
        all_task_file_path: str = os.path.join(
            _configs["TASKS_HUB"], task_dir_name, "*"
        )
        all_task_files = glob.glob(all_task_file_path)

        for index, task_file in enumerate(all_task_files, start=1):
            print(f"Task starts: {index} / {len(all_task_files)}")
            try:
                task_object = TaskObject(task_dir_name, task_file)
                self.instantiate_single_file(task_object)
            except Exception as e:
                logging.exception(f"Error in task {index}: {str(e)}")
                # Take the name from the path, not from task_object: if building
                # the TaskObject is what failed, task_object is unbound (first
                # task) or still refers to the previous task.
                self._handle_error(os.path.basename(task_file), e)

        print("All tasks have been processed.")

    def instantiate_single_file(self, task_object: TaskObject) -> None:
        """
        Execute the process for one task: choose a template, prefill, filter, save.
        :param task_object: The TaskObject containing task details.
        :raises Exception: Re-raises any failure of the flows, after logging it.
        """
        from instantiation.controller.env.env_manager import WindowsAppEnv
        from instantiation.controller.workflow.choose_template_flow import ChooseTemplateFlow
        from instantiation.controller.workflow.filter_flow import FilterFlow
        from instantiation.controller.workflow.prefill_flow import PrefillFlow

        # Initialize the app environment and the task file name.
        app_object = task_object.app_object
        app_name = app_object.description.lower()
        app_env = WindowsAppEnv(app_object)
        task_file_name = task_object.task_file_name

        try:
            start_time = time.time()

            # Initialize the template flow and execute it to copy the template
            choose_template_flow = ChooseTemplateFlow(
                app_name, app_object.file_extension, task_file_name
            )
            template_copied_path = choose_template_flow.execute()

            # Initialize the prefill flow and execute it with the copied template and task details
            prefill_flow = PrefillFlow(app_env, task_file_name)
            instantiated_request, instantiated_plan = prefill_flow.execute(
                template_copied_path, task_object.task, task_object.refined_steps
            )

            # Initialize the filter flow to evaluate the instantiated request;
            # the third returned element (request type) is not used here.
            filter_flow = FilterFlow(app_name, task_file_name)
            is_quality_good, filter_result, _ = filter_flow.execute(
                instantiated_request
            )

            # Calculate total execution time for the process
            total_execution_time = round(time.time() - start_time, 3)

            # Prepare a dictionary to store the execution time for each stage
            execution_time = {
                "choose_template": choose_template_flow.execution_time,
                "prefill": prefill_flow.execution_time,
                "filter": filter_flow.execution_time,
                "total": total_execution_time,
            }

            # Prepare the result structure to capture the filter result
            result = {"filter": filter_result}

            # Create a summary of the instantiated task information
            instantiated_task_info = {
                "unique_id": task_object.unique_id,
                "original_task": task_object.task,
                "original_steps": task_object.refined_steps,
                "instantiated_request": instantiated_request,
                "instantiated_plan": instantiated_plan,
                "result": result,
                "execution_time": execution_time,
            }

            # Save the instantiated task information using the designated method
            self._save_instantiated_task(
                instantiated_task_info, task_object.task_file_base_name, is_quality_good
            )
        except Exception as e:
            logging.exception(f"Error processing task: {str(e)}")
            raise

    def _handle_error(self, task_file_base_name: str, error: Exception) -> None:
        """
        Handle error logging for task processing.
        :param task_file_base_name: The base name of the task file.
        :param error: The exception raised during processing.
        """
        error_folder = os.path.join(
            _configs["TASKS_HUB"], "prefill_instantiated", "instances_error"
        )
        os.makedirs(error_folder, exist_ok=True)

        err_logger = BaseSession.initialize_logger(
            error_folder, task_file_base_name, "w", _configs
        )

        # Format the traceback from the exception object itself, so the result
        # does not depend on being called from inside an except block.
        formatted_traceback = "".join(
            traceback.format_exception(type(error), error, error.__traceback__)
        )

        error_log = {
            "error_message": str(error),
            "traceback": formatted_traceback,
        }

        err_logger.error(json.dumps(error_log, ensure_ascii=False, indent=4))

    def _save_instantiated_task(
        self,
        instantiated_task_info: Dict[str, Any],
        task_file_base_name: str,
        is_quality_good: bool,
    ) -> None:
        """
        Save the instantiated task information to a JSON file.
        :param instantiated_task_info: A dictionary containing instantiated task details.
        :param task_file_base_name: The base name of the task file.
        :param is_quality_good: Indicates whether the quality of the task is good.
        """
        # Convert the dictionary to a JSON string
        task_json = json.dumps(instantiated_task_info, ensure_ascii=False, indent=4)

        # Passing instances go to instances_pass, the others to instances_fail
        instance_folder = os.path.join(_configs["TASKS_HUB"], "prefill_instantiated")
        pass_folder = os.path.join(instance_folder, "instances_pass")
        fail_folder = os.path.join(instance_folder, "instances_fail")
        target_folder = pass_folder if is_quality_good else fail_folder

        new_task_path = os.path.join(target_folder, task_file_base_name)
        os.makedirs(os.path.dirname(new_task_path), exist_ok=True)

        with open(new_task_path, "w", encoding="utf-8") as f:
            f.write(task_json)

        print(f"Task saved to {new_task_path}")
+ :param verbose: The verbosity level. + :return: The prompt for APIs. + """ + + # Construct the prompt for APIs + if len(apis) == 0: + api_list = [ + "- The action type are limited to {actions}.".format( + actions=list(self.api_prompt_template.keys()) + ) + ] + + # Construct the prompt for each API + for key in self.api_prompt_template.keys(): + api = self.api_prompt_template[key] + if verbose > 0: + api_text = "{summary}\n{usage}".format( + summary=api["summary"], usage=api["usage"] + ) + else: + api_text = api["summary"] + + api_list.append(api_text) + + api_prompt = self.retrived_documents_prompt_helper("", "", api_list) + else: + api_list = [ + "- The action type are limited to {actions}.".format( + actions=list(apis.keys()) + ) + ] + + # Construct the prompt for each API + for key in apis.keys(): + api = apis[key] + api_text = "{description}\n{example}".format( + description=api["description"], example=api["example"] + ) + api_list.append(api_text) + + api_prompt = self.retrived_documents_prompt_helper("", "", api_list) + + return api_prompt + + def system_prompt_construction(self, app: str = "") -> str: + """ + Construct the prompt for the system. + :param app: The app name. + :return: The prompt for the system. + """ + + try: + ans = self.prompt_template["system"] + ans = ans.format(app=app) + return ans + except Exception as e: + print(e) + + def user_prompt_construction(self, request: str) -> str: + """ + Construct the prompt for the user. + :param request: The user request. + :return: The prompt for the user. + """ + prompt = self.prompt_template["user"].format(request=request) + return prompt + + def user_content_construction(self, request: str) -> List[Dict]: + """ + Construct the prompt for LLMs. + :param request: The user request. + :return: The prompt for LLMs. 
+ """ + + user_content = [] + + user_content.append( + {"type": "text", "text": self.user_prompt_construction(request)} + ) + + return user_content + + def examples_prompt_helper( + self, + header: str = "## Response Examples", + separator: str = "Example", + additional_examples: List[str] = [], + ) -> str: + """ + Construct the prompt for examples. + :param header: The header of the prompt. + :param separator: The separator of the prompt. + :param additional_examples: The additional examples. + :return: The prompt for examples. + """ + + template = """ + [User Request]: + {request} + [Response]: + {response} + [Tip] + {tip} + """ + + example_list = [] + + for key in self.example_prompt_template.keys(): + if key.startswith("example"): + example = template.format( + request=self.example_prompt_template[key].get("Request"), + response=json.dumps( + self.example_prompt_template[key].get("Response") + ), + tip=self.example_prompt_template[key].get("Tips", ""), + ) + example_list.append(example) + + example_list += [json.dumps(example) for example in additional_examples] + + return self.retrived_documents_prompt_helper(header, separator, example_list) + + +class PrefillPrompter(BasicPrompter): + """ + Load the prompt for the PrefillAgent. + """ + + def __init__( + self, + is_visual: bool, + prompt_template: str, + example_prompt_template: str, + api_prompt_template: str, + ): + """ + Initialize the PrefillPrompter. + :param is_visual: The flag indicating whether the prompter is visual or not. + :param prompt_template: The prompt template. + :param example_prompt_template: The example prompt template. + :param api_prompt_template: The API prompt template. + """ + + super().__init__(is_visual, prompt_template, example_prompt_template) + self.api_prompt_template = self.load_prompt_template( + api_prompt_template, is_visual + ) + + def api_prompt_helper(self, verbose: int = 1) -> str: + """ + Construct the prompt for APIs. + :param verbose: The verbosity level. 
class PrefillPrompter(BasicPrompter):
    """
    Load the prompt for the PrefillAgent.
    """

    def __init__(
        self,
        is_visual: bool,
        prompt_template: str,
        example_prompt_template: str,
        api_prompt_template: str,
    ):
        """
        Initialize the PrefillPrompter.
        :param is_visual: The flag indicating whether the prompter is visual or not.
        :param prompt_template: The prompt template.
        :param example_prompt_template: The example prompt template.
        :param api_prompt_template: The API prompt template.
        """

        super().__init__(is_visual, prompt_template, example_prompt_template)
        loaded_api_template = self.load_prompt_template(api_prompt_template, is_visual)
        self.api_prompt_template = loaded_api_template

    def api_prompt_helper(self, verbose: int = 1) -> str:
        """
        Construct the prompt for APIs.
        :param verbose: The verbosity level; above 0 the usage text of every
            API is appended to its summary.
        :return: The prompt for APIs.
        """

        templates = self.api_prompt_template

        # First entry names the allowed actions, then one entry per API.
        entries = [
            "- The action type are limited to {actions}.".format(
                actions=list(templates.keys())
            )
        ]
        for name in templates.keys():
            spec = templates[name]
            entries.append(
                "{summary}\n{usage}".format(
                    summary=spec["summary"], usage=spec["usage"]
                )
                if verbose > 0
                else spec["summary"]
            )

        return self.retrived_documents_prompt_helper("", "", entries)

    def system_prompt_construction(self, additional_examples: List = []) -> str:
        """
        Construct the prompt for the system.
        :param additional_examples: The additional examples.
        :return: The prompt for the system.
        """

        example_section = self.examples_prompt_helper(
            additional_examples=additional_examples
        )
        # The system prompt only carries the API summaries (verbose=0).
        api_section = self.api_prompt_helper(verbose=0)
        system_template = self.prompt_template["system"]
        return system_template.format(apis=api_section, examples=example_section)

    def user_prompt_construction(
        self, given_task: str, reference_steps: List, doc_control_state: Dict
    ) -> str:
        """
        Construct the prompt for the user.
        :param given_task: The given task.
        :param reference_steps: The reference steps; JSON-encoded into the prompt.
        :param doc_control_state: The document control state; JSON-encoded into the prompt.
        :return: The prompt for the user.
        """

        return self.prompt_template["user"].format(
            given_task=given_task,
            reference_steps=json.dumps(reference_steps),
            doc_control_state=json.dumps(doc_control_state),
        )

    def load_screenshots(self, log_path: str) -> str:
        """
        Load the screenshot stored as ``screenshot.png`` in the log path.
        :param log_path: The path of the log.
        :return: The screenshot URL.
        """
        from ufo.prompter.eva_prompter import EvaluationAgentPrompter

        return EvaluationAgentPrompter.load_single_screenshot(
            os.path.join(log_path, "screenshot.png")
        )

    def user_content_construction(
        self,
        given_task: str,
        reference_steps: List,
        doc_control_state: Dict,
        log_path: str,
    ) -> List[Dict]:
        """
        Construct the prompt for LLMs.
        :param given_task: The given task.
        :param reference_steps: The reference steps.
        :param doc_control_state: The document control state.
        :param log_path: The path of the log; only read in visual mode.
        :return: The prompt for LLMs.
        """

        text_item = {
            "type": "text",
            "text": self.user_prompt_construction(
                given_task, reference_steps, doc_control_state
            ),
        }

        if not self.is_visual:
            return [text_item]

        screenshot = self.load_screenshots(log_path)
        # NOTE(review): the whitespace inside this literal is part of the
        # prompt sent to the model; confirm it against the upstream file.
        screenshot_text = """You are a action prefill agent, responsible to prefill the given task.
                        This is the screenshot of the current environment, please check it and give prefilled task accodingly."""

        # Visual mode: instruction and screenshot precede the task text.
        return [
            {"type": "text", "text": screenshot_text},
            {"type": "image_url", "image_url": {"url": screenshot}},
            text_item,
        ]

    def examples_prompt_helper(
        self,
        header: str = "## Response Examples",
        separator: str = "Example",
        additional_examples: List[str] = [],
    ) -> str:
        """
        Construct the prompt for examples.
        :param header: The header of the prompt.
        :param separator: The separator of the prompt.
        :param additional_examples: The additional examples; each one is
            JSON-encoded and appended after the template examples.
        :return: The prompt for examples.
        """

        # NOTE(review): the whitespace inside this literal is part of the
        # prompt sent to the model; confirm it against the upstream file.
        template = """
        [User Request]:
            {request}
        [Response]:
            {response}
        [Tip]
            {tip}
        """

        # Keys not starting with "example" (e.g. "version") are skipped.
        rendered = [
            template.format(
                request=entry.get("Request"),
                response=json.dumps(entry.get("Response")),
                tip=entry.get("Tips", ""),
            )
            for key, entry in self.example_prompt_template.items()
            if key.startswith("example")
        ]
        rendered.extend(json.dumps(extra) for extra in additional_examples)

        return self.retrived_documents_prompt_helper(header, separator, rendered)
+ """ + + template = """ + [User Request]: + {request} + [Response]: + {response} + [Tip] + {tip} + """ + + example_list = [] + + for key in self.example_prompt_template.keys(): + if key.startswith("example"): + example = template.format( + request=self.example_prompt_template[key].get("Request"), + response=json.dumps( + self.example_prompt_template[key].get("Response") + ), + tip=self.example_prompt_template[key].get("Tips", ""), + ) + example_list.append(example) + + example_list += [json.dumps(example) for example in additional_examples] + + return self.retrived_documents_prompt_helper(header, separator, example_list) diff --git a/instantiation/controller/prompts/visual/api.yaml b/instantiation/controller/prompts/visual/api.yaml new file mode 100644 index 00000000..e3ba3511 --- /dev/null +++ b/instantiation/controller/prompts/visual/api.yaml @@ -0,0 +1,66 @@ +Click: + summary: |- + "Click" is to click the control item with mouse. + usage: |- + [1] API call: click_input(button=, double) + [2] Args: + - button: 'The mouse button to click. One of ''left'', ''right'', ''middle'' or ''x'' (Default: ''left'')' + - double: 'Whether to perform a double click or not (Default: False)' + [3] Example: click_input(button="left", double=False) + [4] Available control item: All control items. + [5] Return: None + + +SetText: + summary: |- + "SetText" is to input text to the control item. + usage: |- + [1] API call: set_edit_text(text="") + [2] Args: + - text: The text input to the Edit control item. It will change the content of current text in the edit block. Set text ='' if you want to clear current text in the block. You must also use Double Backslash escape character to escape the single quote in the string argument. + [3] Example: set_edit_text(text="Hello World. \\n I enjoy the reading of the book 'The Lord of the Rings'. 
It's a great book.") + [4] Available control item: [Edit] + [5] Return: None + +Annotate: + summary: |- + "Annotate" is to take a screenshot of the current application window and annotate the control item on the screenshot. + usage: |- + [1] API call: annotation(control_labels: List[str]=[]) + [2] Args: + - control_labels: The list of annotated label of the control item. If the list is empty, it will annotate all the control items on the screenshot. + [3] Example: annotation(control_labels=["1", "2", "3", "36", "58"]) + [4] Available control item: All control items. + [5] Return: None + +Summary: + summary: |- + "Summary" is to summarize your observation of the current application window base on the clean screenshot. This usually happens when the you need to complete the user request by summarizing or describing the information on the current application window. You must use the 'text' argument to input the summarized text. + usage: |- + [1] API call: summary(text="") + [2] Args: None + [3] Example: summary(text="The image shows a workflow of a AI agent framework. \\n The framework has three components: the 'data collection', the 'data processing' and the 'data analysis'.") + [4] Available control item: All control items. + [5] Return: the summary of the image. + +GetText: + summary: |- + "GetText" is to get the text of the control item. It typical apply to Edit and Document control item when user request is to get the text of the control item. + usage: |- + [1] API call: texts() + [2] Args: None + [3] Example: texts() + [4] All control items. + [5] Return: the text content of the control item. + +Scroll: + summary: |- + "Scroll" is to scroll the control item. It typical apply to a ScrollBar type of control item when user request is to scroll the control item, or the targeted control item is not visible nor available in the control item list, but you know the control item is in the application window and you need to scroll to find it. 
+ usage: |- + [1] API call: wheel_mouse_input() + [2] Args: + - wheel_dist: The distance to scroll. Positive values indicate upward scrolling, negative values indicate downward scrolling. + [3] Example: wheel_mouse_input(wheel_dist=-20) + [4] All control items. + [5] Return: None + \ No newline at end of file diff --git a/instantiation/controller/prompts/visual/filter.yaml b/instantiation/controller/prompts/visual/filter.yaml new file mode 100644 index 00000000..7d25195f --- /dev/null +++ b/instantiation/controller/prompts/visual/filter.yaml @@ -0,0 +1,26 @@ +version: 1.0 + +system: |- + You are a task judge, will be provided with a task in the . You need to judge whether this task can be executed locally. + + ## Evaluation Dimension + The task is only related to {app}. + This task should be like a task, not subjective considerations. For example, if there are 'custom', 'you want' and other situations, they cannot be considered and should return false and be classified as Non_task. Any subjective will crash the system. + This task should specify the element, for example, if there are only 'text' without the specific string, they cannot be considered and should return false and be classified as Non_task. + This task should not involve interactions with other application plug-ins, etc., and only rely on Word. If 'Excel', 'Edge' and other interactions are involved, it should return false and be classified as App_involve. + This task should not involve version updates and other interactions that depend on the environment, but only rely on the current version, and do not want to be upgraded or downgraded. It should return false and be classified as Env. 
+ There are other things that you think cannot be executed or are irrelevant, return False, and be classified as Others + + ## Response Format + Your response should be strictly structured in a JSON format, consisting of three distinct parts with the following keys and corresponding content: + {{ + "judge": true or false depends on you think this task whether can be performed. + "thought": "Outline the reason why you give the judgement." + "type": "None/Non_task/App_involve/Env/Others" + }} + Make sure you answer must be strictly in JSON format only, without other redundant text such as json header. Otherwise it will crash the system. + Below is only a example of the response. Do not fall in the example. + +user: |- + {request} + \ No newline at end of file diff --git a/instantiation/controller/prompts/visual/prefill.yaml b/instantiation/controller/prompts/visual/prefill.yaml new file mode 100644 index 00000000..46974554 --- /dev/null +++ b/instantiation/controller/prompts/visual/prefill.yaml @@ -0,0 +1,124 @@ +version: 1.0 + +system: |- + You are a Agent Task Creator and planer. + You will receive a that is abstract and your objective is to instantiate this task, and give the step-by-step actions to take. + - You are provided with a doc file environment, which contains the control information in . + - You should review the doc canvas content and control information to detail the to a .The control information is in a dict tree of available control items format. + - You are provided with , you should review the acions carefully and choose the most suitable ones step-by-step . + You are also provided with some steps to reference in + - You should also review these steps carefully, to help you instantiate the original task and give the actions. + + + ## Control item + - The control item is the element on the page that you can interact with, we limit the actionable control item to the following: + - "Button" is the control item that you can click. 
+ - "Edit" is the control item that you can click and input text. + - "TabItem" is the control item that you can click and switch to another page. + - "ListItem" is the control item that you can click and select. + - "MenuItem" is the control item that you can click and select. + - "ScrollBar" is the control item that you can scroll. + - "TreeItem" is the control item that you can click and select. + - "Document" is the control item that you can click and select text. + - "Hyperlink" is the control item that you can click and open a link. + - "ComboBox" is the control item that you can click and input text. The Google search box is an example of ComboBox. + + ## Available Actions on the control item + - All the available actions are listed below: + {apis} + + Besides, please prefill the task based on the screenshot. you will also be provided with a screenshot, one before the agent's execution and one after the agent's execution. + + ## The requirements for + 1. The must based on the given task, but if more then one options exist in , you must choose one of them. + 2. The must be able to be completed step-by-step by a Windows Operating System or an Application on Windows platform. + 3. The should be specific and individual, you should not provide different options. + 4. You should keep clear and objective, any vague vocabulary or any subjective terms are forbidden. + 5. You should try your best not to make the become verbose, can only add up to 50 words into . + 6. The detailed target in should be specific and clear based on the doc canvas content and control information. + 7. The should be able to implemented by the available controls and actions. + + + ## The requirements for + 1. The should be step-by-step actions to take in the doc file environment. + 2. Each action should be in the available actions from . + 3. Each action should be generated with a "step" description which is the function description of the action. + 4. 
No need to explain the purpose of the action, just give the actions to take. + 5. Each plan should focus on a single action, if multiple actions need to be performed, you should separate them into different steps. + + ## Response Format + - You are required to response in a JSON format, consisting of several distinct parts with the following keys and corresponding content: + {{ + "observation": , + "thought": , + "new_task":, + "actions_plan": + }} + + ### Action Call Format + - The action call format is the same as the available actions in the API list.You are required to provide the action call format in a JSON format: + {{ + "step ": + "controlLabel": . If you believe none of the control item is suitable for the task or the task is complete, kindly output a empty string ''.> + "controlText": .The control text must match exactly with the selected control label. + If the function to call don't need specify controlText or the task is complete,you can kindly output an empty string ''. + If the function to call need to specify controlText and none of the control item is suitable for the task,you should input a possible control name.> + "function": + "args": + }} + + e.g. + {{ + "step 1": "change the borders", + "controlLabel": "", + "controlText": "Borders", + "function": "click_input", + "args": {{ + "button": "left", + "double": false + }} + }} + + {{ + "step 2": "change the borders", + "controlLabel": "101", + "controlText": "Borders", + "function": "click_input", + "args": {{ + "control_id": "101", + "button": "left", + "double": false + }} + }} + + {{ + "step 3": "select the target text", + "controlLabel": "", + "controlText": "", + "function": "select_text", + "args": {{ + "text": "Test For Fun" + }} + }} + + - The field must be strictly in a format separated each action call by "\n". 
The list format should be like this: + "action call 1\naction call 2\naction call 3" + - If you think the original task don't need to be detailed, you can directly copy the original task to the "new_task". + - You should review the apis function carefully and if the function to call need to specify target control,the 'controlText' field + cannot be set empty. + - The "step" description should be consistent with the action and also the thought. + + ## Here are some examples for you to complete the user request: + {examples} + + ## Tips + - Read the above instruction carefully. Make sure the response and action strictly following these instruction and meet the user request. + - Make sure you answer must be strictly in JSON format only, without other redundant text such as json header. Your output must be able to be able to be parsed by json.loads(). Otherwise, it will crash the system and destroy the user's computer. + - Your task is very important to improve the agent's performance. I will tip you 200$ if you do well. Thank you for your hard work! + +user: |- + {given_task} + {reference_steps} + {doc_control_state} + \ No newline at end of file diff --git a/instantiation/controller/prompts/visual/prefill_example.yaml b/instantiation/controller/prompts/visual/prefill_example.yaml new file mode 100644 index 00000000..a50635ea --- /dev/null +++ b/instantiation/controller/prompts/visual/prefill_example.yaml @@ -0,0 +1,44 @@ + +version: 1.0 + +example1: + Request: |- + Delete Text in document. 
+ {'w:document': {'w:body': {'w:p': [{'w:r': {'@w:rsidRPr': '00E2735E', 'w:rPr': {'w:rFonts': {'@w:ascii': 'Consolas', '@w:eastAsia': 'Times New Roman', '@w:hAnsi': 'Consolas', '@w:cs': 'Times New Roman'}, 'w:sz': {'@w:val': '21'}, 'w:szCs': {'@w:val': '21'}, 'w:lang': {'@w:eastAsia': 'zh-CN'}, 'w:color': '000000'}, 'w:t': 'text to edit'}}]}}} + Response: + observation: |- + I observe the canvas state is a Word document with a body containing a paragraph with a run element, which has a text element 'text to edit'. + thought: |- + My task is to detail the given task and give the step-by-step actions to take. + The user needs to delete text in the Word document. + Based on the canvas state, there is a text element 'text to edit'. + And based on the available apis and controls,the user can use "select_text" to select the target to delete,and "type_keys" to type in delete. + Therefore,the user can detail the task to delete 'text to edit' in the Word document. + In this case, the user should select the text to edit in the Word document and press the 'Delete' key on the keyboard to delete the selected text. + new_task: |- + Delete the 'text to edit' in the Word document. + action_plans: |- + {{"step 1":"choose the target text 'text to edit'","controlLabel": "", "controlText": "", "function": "select_text", "args": {{"text": "text to edit"}}}} + {{"step 2":"type in delete keys to finish delete","controlLabel": "101", "controlText": "Edit", "function": "type_keys", "args": {{"text": "{DELETE}"}}}} + +example2: + Request: |- + Highlight Text in document. 
+ {'w:document': {'w:body': {'w:p': [{'w:r': {'@w:rsidRPr': '00E2735E', 'w:rPr': {'w:rFonts': {'@w:ascii': 'Consolas', '@w:eastAsia': 'Times New Roman', '@w:hAnsi': 'Consolas', '@w:cs': 'Times New Roman'}, 'w:sz': {'@w:val': '21'}, 'w:szCs': {'@w:val': '21'}, 'w:lang': {'@w:eastAsia': 'zh-CN'}, 'w:color': '000000'}, 'w:t': 'text to edit'}}]}}} + Response: + observation: |- + I observe the canvas state is a Word document with a body containing a paragraph with a run element, which has a text element 'text to edit'. + thought: |- + My task is to detail the given task and give the step-by-step actions to take. + The user needs to highlight text in the Word document. + Based on the canvas state, there is a text element 'text to edit'. + And based on the available apis and controls,the user can use "select_text" to select the target to highlight and then to highlight the text. + Since there is no "Highlight" button available,I should click to the 'Home' tab first and then click the 'Highlight' button. + Therefore,the user can detail the task to highlight 'text to edit' in the Word document. + In this case, the user should select the 'text to edit' in the Word document and press the 'Home' button and 'Highlight' button respectively. + new_task: |- + Highlight 'text to edit' in the Word document. 
+ action_plans: |- + {{"step 1":"choose the target text 'text to edit'","controlLabel": "", "controlText": "", "function": "select_text", "args": {{"text": "text to edit"}}}} + {{"step 2":"change ribbon to Home to show the highlight button","controlLabel": "11", "controlText": "Home", "function": "click_input", "args": {{"button": "left", "double": false}}}} + {{"step 3":"click the highlight button to finish highlight","controlLabel": "", "controlText": "Highlight", "function": "click_input", "args": {{"button": "left", "double": false}}}} diff --git a/instantiation/controller/workflow/choose_template_flow.py b/instantiation/controller/workflow/choose_template_flow.py new file mode 100644 index 00000000..1fac860b --- /dev/null +++ b/instantiation/controller/workflow/choose_template_flow.py @@ -0,0 +1,174 @@ +import json +import os +import random +import time +import warnings +from datetime import datetime +from pathlib import Path +from typing import Dict + +from instantiation.config.config import Config +from langchain.embeddings import CacheBackedEmbeddings +from langchain.storage import LocalFileStore +from langchain_community.embeddings import HuggingFaceEmbeddings +from langchain_community.vectorstores import FAISS + +_configs = Config.get_instance().config_data + + +class ChooseTemplateFlow: + """ + Class to select and copy the most relevant template file based on the given task context. + """ + + _SENTENCE_TRANSFORMERS_PREFIX = "sentence-transformers/" + + def __init__(self, app_name: str, file_extension: str, task_file_name: str): + """ + Initialize the flow with the given task context. + :param app_name: The name of the application. + :param file_extension: The file extension of the template. + :param task_file_name: The name of the task file. 
class ChooseTemplateFlow:
    """
    Class to select and copy the most relevant template file based on the given task context.
    """

    _SENTENCE_TRANSFORMERS_PREFIX = "sentence-transformers/"

    def __init__(self, app_name: str, file_extension: str, task_file_name: str):
        """
        Initialize the flow with the given task context.
        :param app_name: The name of the application.
        :param file_extension: The file extension of the template; appended verbatim to the copied file name.
        :param task_file_name: The name of the task file.
        """
        self._app_name = app_name
        self._file_extension = file_extension
        self._task_file_name = task_file_name
        self.execution_time = 0
        self._embedding_model = self._load_embedding_model(
            model_name=_configs["CONTROL_FILTER_MODEL_SEMANTIC_NAME"]
        )

    def execute(self) -> str:
        """
        Execute the flow and return the copied template path.
        The elapsed wall-clock time is stored in ``execution_time`` (seconds, 3 decimals).
        :return: The path to the copied template file.
        """
        start_time = time.time()
        template_copied_path = self._choose_template_and_copy()
        self.execution_time = round(time.time() - start_time, 3)
        return template_copied_path

    def _create_copied_file(
        self, copy_from_path: Path, copy_to_folder_path: Path, file_name: str = None
    ) -> str:
        """
        Create a cache file from the specified source.
        :param copy_from_path: The original path of the file.
        :param copy_to_folder_path: The path where the cache file will be created.
        :param file_name: Optional; the name of the task file. May contain sub-folders.
        :return: The path to the newly created cache file.
        """
        destination = Path(
            self._generate_copied_file_path(copy_to_folder_path, file_name)
        )
        # The task name can carry sub-folders, so create the parent of the
        # final path (which also creates copy_to_folder_path itself).
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_bytes(Path(copy_from_path).read_bytes())

        return str(destination)

    def _generate_copied_file_path(self, folder_path: Path, file_name: str) -> str:
        """
        Generate the file path for the copied template.
        :param folder_path: The folder where the file will be created.
        :param file_name: Optional; the name of the task file. When empty or
            None, a timestamp (second resolution) is used as the file name.
        :return: The path to the newly created file.
        """
        template_extension = self._file_extension
        if file_name:
            return str(folder_path / f"{file_name}{template_extension}")
        timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        return str(folder_path / f"{timestamp}{template_extension}")

    def _get_chosen_file_path(self) -> str:
        """
        Choose the most relevant template file based on the task.
        Falls back to a random template when the app has no ``description.json``.
        :return: The path of the chosen template, relative to the app's template folder.
        """
        templates_description_path = (
            Path(_configs["TEMPLATE_PATH"]) / self._app_name / "description.json"
        )

        try:
            with open(templates_description_path, "r", encoding="utf-8") as f:
                return self._choose_target_template_file(
                    self._task_file_name, json.load(f)
                )
        except FileNotFoundError:
            warnings.warn(
                f"Warning: {templates_description_path} does not exist. Choosing a random template."
            )
            return self._choose_random_template()

    def _choose_random_template(self) -> str:
        """
        Select a random template file from the template folder.
        :return: The file name of the randomly selected template, relative to
            the app's template folder.
        :raises FileNotFoundError: If the folder holds no files.
        """
        template_folder = Path(_configs["TEMPLATE_PATH"]) / self._app_name
        template_files = [f for f in template_folder.iterdir() if f.is_file()]

        if not template_files:
            raise FileNotFoundError(
                "No template files found in the specified directory."
            )

        chosen_template_file = random.choice(template_files)
        print(f"Randomly selected template: {chosen_template_file.name}")
        # Return the bare name: the caller joins the result onto the template
        # folder again, so a folder-prefixed path would be prefixed twice
        # whenever TEMPLATE_PATH is relative.
        return chosen_template_file.name

    def _choose_template_and_copy(self) -> str:
        """
        Choose the template and copy it to the cache folder.
        :return: The path to the copied template file.
        """
        chosen_template_file_path = self._get_chosen_file_path()
        chosen_template_full_path = (
            Path(_configs["TEMPLATE_PATH"]) / self._app_name / chosen_template_file_path
        )

        # Target folder: "<grand-parent folder of the task file>_templates"
        # below TASKS_HUB.
        target_template_folder_path = Path(_configs["TASKS_HUB"]) / (
            os.path.dirname(os.path.dirname(self._task_file_name)) + "_templates"
        )

        return self._create_copied_file(
            chosen_template_full_path, target_template_folder_path, self._task_file_name
        )

    def _choose_target_template_file(
        self, given_task: str, doc_files_description: Dict[str, str]
    ) -> str:
        """
        Get the target file based on the semantic similarity of the given task and the template file descriptions.
        :param given_task: The task to be matched.
        :param doc_files_description: A dictionary mapping template file names to their descriptions.
        :return: The file name of the chosen template.
        :raises ValueError: If there are no descriptions or no match is returned.
        """
        if not doc_files_description:
            # An empty corpus cannot be indexed; fail with a clear message.
            raise ValueError("No similar templates found.")

        # Reverse lookup from description text to file name. Files sharing an
        # identical description collapse onto the last one listed.
        file_doc_map = {
            desc: file_name for file_name, desc in doc_files_description.items()
        }
        db = FAISS.from_texts(
            list(doc_files_description.values()), self._embedding_model
        )
        most_similar = db.similarity_search(given_task, k=1)

        if not most_similar:
            raise ValueError("No similar templates found.")

        return file_doc_map[most_similar[0].page_content]

    @staticmethod
    def _load_embedding_model(model_name: str) -> "CacheBackedEmbeddings":
        """
        Load the embedding model.
        :param model_name: The name of the embedding model to load; the
            "sentence-transformers/" prefix is added when missing.
        :return: The loaded embedding model, wrapped in a file-backed cache.
        """
        store = LocalFileStore(_configs["CONTROL_EMBEDDING_CACHE_PATH"])
        if not model_name.startswith(ChooseTemplateFlow._SENTENCE_TRANSFORMERS_PREFIX):
            model_name = ChooseTemplateFlow._SENTENCE_TRANSFORMERS_PREFIX + model_name
        embedding_model = HuggingFaceEmbeddings(model_name=model_name)
        return CacheBackedEmbeddings.from_bytes_store(
            embedding_model, store, namespace=model_name
        )
import json
import logging
import os
import time
from typing import Dict, Tuple

from instantiation.config.config import Config
from instantiation.controller.agent.agent import FilterAgent

from ufo.module.basic import BaseSession

_configs = Config.get_instance().config_data


class FilterFlow:
    """
    Class to refine the plan steps and prefill the file based on filtering criteria.
    """

    # One FilterAgent per application name, shared by every flow instance.
    _app_filter_agent_dict: Dict[str, FilterAgent] = {}

    def __init__(self, app_name: str, task_file_name: str) -> None:
        """
        Initialize the filter flow for a task.
        :param app_name: Name of the application the task targets.
        :param task_file_name: Name of the task file being processed.
        """
        self.execution_time = 0
        self._app_name = app_name
        log_path_template = _configs["FILTER_LOG_PATH"]
        self._log_path_configs = log_path_template.format(task=task_file_name)
        self._filter_agent = self._get_or_create_filter_agent()
        self._initialize_logs()

    def _get_or_create_filter_agent(self) -> FilterAgent:
        """
        Retrieve or create a filter agent for the given application.
        :return: FilterAgent instance for the specified application.
        """
        registry = FilterFlow._app_filter_agent_dict
        if self._app_name in registry:
            return registry[self._app_name]

        registry[self._app_name] = FilterAgent(
            "filter",
            self._app_name,
            is_visual=True,
            main_prompt=_configs["FILTER_PROMPT"],
            example_prompt="",
            api_prompt=_configs["API_PROMPT"],
        )
        return registry[self._app_name]

    def execute(self, instantiated_request) -> Tuple[bool, str, str]:
        """
        Execute the filter flow: Filter the task and save the result.
        The elapsed wall-clock time is stored in ``execution_time``.
        :param instantiated_request: Request object to be filtered.
        :return: Tuple containing task quality flag, comment, and task type.
        """
        started_at = time.time()
        is_quality_good, filter_result, request_type = self._get_filtered_result(
            instantiated_request
        )
        elapsed = time.time() - started_at
        self.execution_time = round(elapsed, 3)
        return is_quality_good, filter_result, request_type

    def _initialize_logs(self) -> None:
        """
        Initialize logging for filter messages and responses.
        """
        log_folder = self._log_path_configs
        os.makedirs(log_folder, exist_ok=True)
        self._filter_message_logger = BaseSession.initialize_logger(
            log_folder, "filter_messages.json", "w", _configs
        )
        self._filter_response_logger = BaseSession.initialize_logger(
            log_folder, "filter_responses.json", "w", _configs
        )

    def _get_filtered_result(self, instantiated_request) -> Tuple[bool, str, str]:
        """
        Get the filtered result from the filter agent.
        :param instantiated_request: Request object containing task details.
        :return: Tuple containing task quality flag, request comment, and request type.
        """
        # Build the prompt and log it before asking the agent.
        prompt_message = self._filter_agent.message_constructor(
            instantiated_request,
            self._app_name,
        )
        self._filter_message_logger.info(json.dumps(prompt_message, indent=4))

        try:
            started_at = time.time()
            response_string, _ = self._filter_agent.get_response(
                prompt_message, "filter", use_backup_engine=True, configs=_configs
            )
            response_json = self._filter_agent.response_to_dict(response_string)
            # Timing covers the LLM call plus parsing of its answer.
            response_json["execution_time"] = round(time.time() - started_at, 3)
            self._filter_response_logger.info(json.dumps(response_json, indent=4))

            judge = response_json["judge"]
            thought = response_json["thought"]
            request_type = response_json["type"]
            return (judge, thought, request_type)

        except Exception as e:
            # Log with the full prompt for diagnosis, then let the caller decide.
            logging.exception(
                f"Error in _get_filtered_result: {str(e)} - Prompt: {prompt_message}",
                exc_info=True,
            )
            raise
+ :param instantiated_request: Request object containing task details. + :return: Tuple containing task quality flag, request comment, and request type. + """ + # Construct the prompt message for the filter agent + prompt_message = self._filter_agent.message_constructor( + instantiated_request, + self._app_name, + ) + prompt_json = json.dumps(prompt_message, indent=4) + self._filter_message_logger.info(prompt_json) + + # Get the response from the filter agent + try: + start_time = time.time() + response_string, _ = self._filter_agent.get_response( + prompt_message, "filter", use_backup_engine=True, configs=_configs + ) + response_json = self._filter_agent.response_to_dict(response_string) + execution_time = round(time.time() - start_time, 3) + + response_json["execution_time"] = execution_time + self._filter_response_logger.info(json.dumps(response_json, indent=4)) + + return ( + response_json["judge"], + response_json["thought"], + response_json["type"], + ) + + except Exception as e: + logging.exception( + f"Error in _get_filtered_result: {str(e)} - Prompt: {prompt_message}", + exc_info=True, + ) + raise diff --git a/instantiation/controller/workflow/prefill_flow.py b/instantiation/controller/workflow/prefill_flow.py new file mode 100644 index 00000000..2fcbd536 --- /dev/null +++ b/instantiation/controller/workflow/prefill_flow.py @@ -0,0 +1,266 @@ +import json +import logging +import os +import time +from typing import Any, Dict, List, Tuple + +from instantiation.config.config import Config +from instantiation.controller.agent.agent import PrefillAgent +from instantiation.controller.env.env_manager import WindowsAppEnv + +from ufo.agents.processors.app_agent_processor import AppAgentProcessor +from ufo.automator.ui_control.inspector import ControlInspectorFacade +from ufo.automator.ui_control.screenshot import PhotographerFacade +from ufo.module.basic import BaseSession + +# Load configuration data +_configs = Config.get_instance().config_data +if _configs: + 
_BACKEND = _configs["CONTROL_BACKEND"] + + +class PrefillFlow(AppAgentProcessor): + """ + Class to manage the prefill process by refining planning steps and automating UI interactions + """ + + _app_prefill_agent_dict: Dict[str, PrefillAgent] = {} + + def __init__( + self, + environment: WindowsAppEnv, + task_file_name: str, + ) -> None: + """ + Initialize the prefill flow with the application context. + :param environment: The environment of the app. + :param task_file_name: The name of the task file for logging and tracking. + """ + self.execution_time = 0 + self._app_env = environment + self._task_file_name = task_file_name + self._app_name = self._app_env.app_name + + # Create or reuse a PrefillAgent for the app + if self._app_name not in PrefillFlow._app_prefill_agent_dict: + PrefillFlow._app_prefill_agent_dict[self._app_name] = PrefillAgent( + "prefill", + self._app_name, + is_visual=True, + main_prompt=_configs["PREFILL_PROMPT"], + example_prompt=_configs["PREFILL_EXAMPLE_PROMPT"], + api_prompt=_configs["API_PROMPT"], + ) + self._prefill_agent = PrefillFlow._app_prefill_agent_dict[self._app_name] + + # Initialize execution step and UI control tools + self._execute_step = 0 + self._control_inspector = ControlInspectorFacade(_BACKEND) + self._photographer = PhotographerFacade() + + # Set default states + self._status = "" + + # Initialize loggers for messages and responses + self._log_path_configs = _configs["PREFILL_LOG_PATH"].format( + task=self._task_file_name + ) + os.makedirs(self._log_path_configs, exist_ok=True) + + # Set up loggers + self._message_logger = BaseSession.initialize_logger( + self._log_path_configs, "prefill_messages.json", "w", _configs + ) + self._response_logger = BaseSession.initialize_logger( + self._log_path_configs, "prefill_responses.json", "w", _configs + ) + + def execute( + self, template_copied_path: str, original_task: str, refined_steps: List[str] + ) -> Tuple[str, List[str]]: + """ + Start the execution by retrieving the 
instantiated result. + :param template_copied_path: The path of the copied template to use. + :param original_task: The original task to refine. + :param refined_steps: The steps to guide the refinement process. + :return: The refined task and corresponding action plans. + """ + start_time = time.time() + instantiated_request, instantiated_plan = self._instantiate_task( + template_copied_path, original_task, refined_steps + ) + self.execution_time = round(time.time() - start_time, 3) + return instantiated_request, instantiated_plan + + def _instantiate_task( + self, template_copied_path: str, original_task: str, refined_steps: List[str] + ) -> Tuple[str, List[str]]: + """ + Retrieve and process the instantiated result for the task. + Interacts with the PrefillAgent to refine the task and generate action plans. + :param template_copied_path: The path of the copied template to use. + :param original_task: The original task to refine. + :param refined_steps: The steps to guide the refinement process. + :return: The refined task and corresponding action plans. + """ + self._app_env.start(template_copied_path) + + try: + # Retrieve prefill actions and task plan + instantiated_request, instantiated_plan = self._get_prefill_actions( + original_task, + refined_steps, + template_copied_path, + ) + + print(f"Original Task: {original_task}") + print(f"Prefilled Task: {instantiated_request}") + + except Exception as e: + logging.exception(f"Error in prefilling task: {e}") + raise + + finally: + self._app_env.close() + return instantiated_request, instantiated_plan + + def _update_state(self, file_path: str) -> None: + """ + Update the current state of the app by inspecting UI elements. + :param file_path: Path of the app file to inspect. 
+ """ + print(f"Updating the app state using the file: {file_path}") + + # Retrieve control elements in the app window + control_list = self._control_inspector.find_control_elements_in_descendants( + self._app_env.app_window, + control_type_list=_configs["CONTROL_LIST"], + class_name_list=_configs["CONTROL_LIST"], + ) + + # Capture UI control annotations + self._annotation_dict = self._photographer.get_annotation_dict( + self._app_env.app_window, control_list, annotation_type="number" + ) + + # Filter out irrelevant control elements + self._filtered_annotation_dict = self.get_filtered_annotation_dict( + self._annotation_dict, configs=_configs + ) + + # Gather control info for both full and filtered lists + self._control_info = self._control_inspector.get_control_info_list_of_dict( + self._annotation_dict, + ["control_text", "control_type" if _BACKEND == "uia" else "control_class"], + ) + self._filtered_control_info = ( + self._control_inspector.get_control_info_list_of_dict( + self._filtered_annotation_dict, + [ + "control_text", + "control_type" if _BACKEND == "uia" else "control_class", + ], + ) + ) + + def _get_prefill_actions( + self, given_task: str, reference_steps: List[str], file_path: str + ) -> Tuple[str, List[str]]: + """ + Generate refined tasks and action plans using the PrefillAgent. + :param given_task: The task to refine. + :param reference_steps: Reference steps for the task. + :param file_path: Path to the task template. + :return: The refined task and corresponding action plans. 
+ """ + self._update_state(file_path) + execution_time = 0 + # Save a screenshot of the app state + screenshot_path = os.path.join(self._log_path_configs, "screenshot.png") + self._save_screenshot(self._task_file_name, screenshot_path) + + # Construct prompt message for the PrefillAgent + prompt_message = self._prefill_agent.message_constructor( + "", + given_task, + reference_steps, + self._filtered_control_info, + self._log_path_configs, + ) + + # Log the constructed message + self._log_message(prompt_message) + + try: + # Record start time and get PrefillAgent response + start_time = time.time() + response_string, _ = self._prefill_agent.get_response( + prompt_message, "prefill", use_backup_engine=True, configs=_configs + ) + execution_time = round(time.time() - start_time, 3) + + # Parse and log the response + response_json = self._prefill_agent.response_to_dict(response_string) + instantiated_request = response_json["new_task"] + instantiated_plan = response_json["actions_plan"] + + except Exception as e: + self._status = "ERROR" + logging.exception(f"Error in prefilling task: {e}") + raise + finally: + # Log the response and execution time + self._log_response(response_json, execution_time) + + return instantiated_request, instantiated_plan + + def _log_message(self, prompt_message: str) -> None: + """ + Log the constructed prompt message for the PrefillAgent. + :param prompt_message: The message constructed for PrefillAgent. + """ + messages_log_entry = { + "step": self._execute_step, + "messages": prompt_message, + "error": "", + } + self._message_logger.info(json.dumps(messages_log_entry, indent=4)) + + def _log_response( + self, response_json: Dict[str, Any], execution_time: float + ) -> None: + """ + Log the response received from PrefillAgent along with execution time. + :param response_json: Response data from PrefillAgent. + :param execution_time: Time taken for the PrefillAgent call. 
+ """ + response_log_entry = { + "step": self._execute_step, + "execution_time": execution_time, + "agent_response": response_json, + "error": "", + } + self._response_logger.info(json.dumps(response_log_entry, indent=4)) + + def _save_screenshot(self, doc_name: str, save_path: str) -> None: + """ + Captures a screenshot of the current window or the full screen if the window is not found. + :param doc_name: The name or description of the document to match the window. + :param save_path: The path where the screenshot will be saved. + """ + try: + # Find the window matching the document name + matched_window = self._app_env.find_matching_window(doc_name) + if matched_window: + screenshot = self._photographer.capture_app_window_screenshot( + matched_window + ) + else: + logging.warning("Window not found, taking a full-screen screenshot.") + screenshot = self._photographer.capture_desktop_screen_screenshot() + + screenshot.save(save_path) + print(f"Screenshot saved to {save_path}") + except Exception as e: + logging.exception(f"Failed to save screenshot: {e}") + raise diff --git a/instantiation/instantiation.py b/instantiation/instantiation.py new file mode 100644 index 00000000..2524092a --- /dev/null +++ b/instantiation/instantiation.py @@ -0,0 +1,39 @@ +import argparse +import os +import sys + + +# Add the project root to the system path. +def add_project_root_to_sys_path() -> None: + """Add project root to system path if not already present.""" + current_dir = os.path.dirname(os.path.abspath(__file__)) + project_root = os.path.abspath(os.path.join(current_dir, "..")) + if project_root not in sys.path: + sys.path.append(project_root) + + +def parse_arguments() -> argparse.Namespace: + """Parse command-line arguments. + + :return: Parsed command-line arguments. 
+ """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--task", help="The name of the task.", type=str, default="prefill" + ) + return parser.parse_args() + + +def main() -> None: + """Main entry point of the script.""" + # Add the project root to the system path. + add_project_root_to_sys_path() + + task_dir_name = parse_arguments().task.lower() + + from instantiation.controller.instantiation_process import InstantiationProcess + InstantiationProcess().instantiate_files(task_dir_name) + + +if __name__ == "__main__": + main() diff --git a/instantiation/tasks/prefill/bulleted.json b/instantiation/tasks/prefill/bulleted.json new file mode 100644 index 00000000..237b68eb --- /dev/null +++ b/instantiation/tasks/prefill/bulleted.json @@ -0,0 +1,9 @@ +{ + "app": "word", + "unique_id": "5", + "task": "Turning lines of text into a bulleted list in Word", + "refined_steps": [ + "1. Place the cursor at the beginning of the line of text you want to turn into a bulleted list", + "2. Click the Bullets button in the Paragraph group on the Home tab and choose a bullet style" + ] +} \ No newline at end of file diff --git a/instantiation/tasks/prefill/delete.json b/instantiation/tasks/prefill/delete.json new file mode 100644 index 00000000..73f29eb8 --- /dev/null +++ b/instantiation/tasks/prefill/delete.json @@ -0,0 +1,9 @@ +{ + "app": "word", + "unique_id": "3", + "task": "Deleting unwanted recovered Word files", + "refined_steps": [ + "1. Open the Word document containing the items you wish to delete", + "2. Select and delete the selected text" + ] +} \ No newline at end of file diff --git a/instantiation/tasks/prefill/draw.json b/instantiation/tasks/prefill/draw.json new file mode 100644 index 00000000..2401260b --- /dev/null +++ b/instantiation/tasks/prefill/draw.json @@ -0,0 +1,10 @@ +{ + "app": "word", + "unique_id": "1", + "task": "Draw or write your signature in the Word desktop app", + "refined_steps": [ + "1. Select tool", + "2. 
Draw or write a signature in the Word desktop app", + "3. Use your mouse, pen, or touch screen to draw or write your signature" + ] +} \ No newline at end of file diff --git a/instantiation/tasks/prefill/macro.json b/instantiation/tasks/prefill/macro.json new file mode 100644 index 00000000..a9f18a53 --- /dev/null +++ b/instantiation/tasks/prefill/macro.json @@ -0,0 +1,9 @@ +{ + "app": "word", + "unique_id": "2", + "task": "Run a macro in Word", + "refined_steps": [ + "1. In the Macro name box that appears, type the name of the macro you want to run", + "2. Click the Run button to execute the selected macro" + ] +} \ No newline at end of file diff --git a/instantiation/tasks/prefill/totate.json b/instantiation/tasks/prefill/totate.json new file mode 100644 index 00000000..2caa5f0b --- /dev/null +++ b/instantiation/tasks/prefill/totate.json @@ -0,0 +1,10 @@ +{ + "app": "word", + "unique_id": "4", + "task": "Rotate text in a SmartArt graphic in Word", + "refined_steps": [ + "1. Click the SmartArt graphic to select it", + "2. To rotate the text, click the Rotate button in the Arrange group on the Format tab", + "3. To rotate the text, select the desired rotation option from the drop-down menu" + ] +} \ No newline at end of file diff --git a/ufo/agents/agent/basic.py b/ufo/agents/agent/basic.py index a29787dd..1f54e08e 100644 --- a/ufo/agents/agent/basic.py +++ b/ufo/agents/agent/basic.py @@ -138,7 +138,7 @@ def message_constructor(self) -> List[Dict[str, Union[str, List[Dict[str, str]]] @classmethod def get_response( - cls, message: List[dict], namescope: str, use_backup_engine: bool + cls, message: List[dict], namescope: str, use_backup_engine: bool, configs = configs ) -> str: """ Get the response for the prompt. @@ -148,7 +148,7 @@ def get_response( :return: The response. 
""" response_string, cost = llm_call.get_completion( - message, namescope, use_backup_engine=use_backup_engine + message, namescope, use_backup_engine=use_backup_engine, configs = configs ) return response_string, cost @@ -236,6 +236,7 @@ def process_resume(self) -> None: if self.processor: self.processor.resume() + def process_asker(self, ask_user: bool = True) -> None: """ Ask for the process. diff --git a/ufo/agents/processors/app_agent_processor.py b/ufo/agents/processors/app_agent_processor.py index aaa21020..bc6aa896 100644 --- a/ufo/agents/processors/app_agent_processor.py +++ b/ufo/agents/processors/app_agent_processor.py @@ -21,7 +21,8 @@ from ufo.agents.agent.app_agent import AppAgent configs = Config.get_instance().config_data -BACKEND = configs["CONTROL_BACKEND"] +if configs is not None: + BACKEND = configs["CONTROL_BACKEND"] class AppAgentProcessor(BaseProcessor): @@ -494,7 +495,8 @@ def demonstration_prompt_helper(self) -> Tuple[List[str], List[str]]: return examples, tips def get_filtered_annotation_dict( - self, annotation_dict: Dict[str, UIAWrapper] + self, annotation_dict: Dict[str, UIAWrapper], + configs = configs ) -> Dict[str, UIAWrapper]: """ Get the filtered annotation dictionary. 
diff --git a/ufo/agents/processors/basic.py b/ufo/agents/processors/basic.py index 515192ac..9e10c27b 100644 --- a/ufo/agents/processors/basic.py +++ b/ufo/agents/processors/basic.py @@ -20,7 +20,8 @@ from ufo.module.context import Context, ContextNames configs = Config.get_instance().config_data -BACKEND = configs["CONTROL_BACKEND"] +if configs is not None: + BACKEND = configs["CONTROL_BACKEND"] class BaseProcessor(ABC): diff --git a/ufo/agents/processors/host_agent_processor.py b/ufo/agents/processors/host_agent_processor.py index cdd4d946..bb5424f0 100644 --- a/ufo/agents/processors/host_agent_processor.py +++ b/ufo/agents/processors/host_agent_processor.py @@ -13,7 +13,8 @@ from ufo.module.context import Context, ContextNames configs = Config.get_instance().config_data -BACKEND = configs["CONTROL_BACKEND"] +if configs is not None: + BACKEND = configs["CONTROL_BACKEND"] if TYPE_CHECKING: from ufo.agents.agent.host_agent import HostAgent diff --git a/ufo/automator/ui_control/controller.py b/ufo/automator/ui_control/controller.py index 6958613d..7db40abd 100644 --- a/ufo/automator/ui_control/controller.py +++ b/ufo/automator/ui_control/controller.py @@ -18,7 +18,7 @@ configs = Config.get_instance().config_data -if configs.get("AFTER_CLICK_WAIT", None) is not None: +if configs is not None and configs.get("AFTER_CLICK_WAIT", None) is not None: pywinauto.timings.Timings.after_clickinput_wait = configs["AFTER_CLICK_WAIT"] pywinauto.timings.Timings.after_click_wait = configs["AFTER_CLICK_WAIT"] diff --git a/ufo/automator/ui_control/openfile.py b/ufo/automator/ui_control/openfile.py index 0d7a8561..0fd8f345 100644 --- a/ufo/automator/ui_control/openfile.py +++ b/ufo/automator/ui_control/openfile.py @@ -9,7 +9,10 @@ configs = Config.get_instance().config_data -BACKEND = configs["CONTROL_BACKEND"] +if configs is not None: + BACKEND = configs["CONTROL_BACKEND"] +else: + BACKEND = "uia" class FileController: @@ -17,9 +20,9 @@ class FileController: Control block for open 
file / specific APP and proceed the operation. """ - def __init__(self): + def __init__(self, backend=BACKEND): - self.backend = BACKEND + self.backend = backend self.file_path = "" self.APP = "" self.apptype = "" diff --git a/ufo/automator/ui_control/screenshot.py b/ufo/automator/ui_control/screenshot.py index fb032170..dd85c14e 100644 --- a/ufo/automator/ui_control/screenshot.py +++ b/ufo/automator/ui_control/screenshot.py @@ -17,7 +17,10 @@ configs = Config.get_instance().config_data -DEFAULT_PNG_COMPRESS_LEVEL = int(configs["DEFAULT_PNG_COMPRESS_LEVEL"]) +if configs is not None: + DEFAULT_PNG_COMPRESS_LEVEL = int(configs["DEFAULT_PNG_COMPRESS_LEVEL"]) +else: + DEFAULT_PNG_COMPRESS_LEVEL = 6 class Photographer(ABC): diff --git a/ufo/config/config.py b/ufo/config/config.py index 7df96a31..99036f91 100644 --- a/ufo/config/config.py +++ b/ufo/config/config.py @@ -14,7 +14,10 @@ class Config: def __init__(self): # Load config here - self.config_data = self.load_config() + if os.getenv("RUN_CONFIGS", "true").lower() != "false": + self.config_data = self.load_config() + else: + self.config_data = None @staticmethod def get_instance(): @@ -26,7 +29,7 @@ def get_instance(): Config._instance = Config() return Config._instance - def load_config(self, config_path="ufo/config/") -> dict: + def load_config(self, config_path = "ufo/config/") -> dict: """ Load the configuration from a YAML file and environment variables. 
@@ -45,14 +48,13 @@ def load_config(self, config_path="ufo/config/") -> dict: # Update configs with YAML data if yaml_data: configs.update(yaml_data) - with open(path + "config_dev.yaml", "r") as file: - yaml_dev_data = yaml.safe_load(file) - with open(path + "config_prices.yaml", "r") as file: - yaml_prices_data = yaml.safe_load(file) - # Update configs with YAML data - if yaml_data: + if os.path.exists(path + "config_dev.yaml"): + with open(path + "config_dev.yaml", "r") as file: + yaml_dev_data = yaml.safe_load(file) configs.update(yaml_dev_data) - if yaml_prices_data: + if os.path.exists(path + "config_prices.yaml"): + with open(path + "config_prices.yaml", "r") as file: + yaml_prices_data = yaml.safe_load(file) configs.update(yaml_prices_data) except FileNotFoundError: print_with_color( diff --git a/ufo/llm/llm_call.py b/ufo/llm/llm_call.py index 78a302be..b47ac9e5 100644 --- a/ufo/llm/llm_call.py +++ b/ufo/llm/llm_call.py @@ -12,7 +12,7 @@ def get_completion( - messages, agent: str = "APP", use_backup_engine: bool = True + messages, agent: str = "APP", use_backup_engine: bool = True, configs = configs ) -> Tuple[str, float]: """ Get completion for the given messages. @@ -23,13 +23,14 @@ def get_completion( """ responses, cost = get_completions( - messages, agent=agent, use_backup_engine=use_backup_engine, n=1 + messages, agent=agent, use_backup_engine=use_backup_engine, n=1, configs = configs ) return responses[0], cost def get_completions( - messages, agent: str = "APP", use_backup_engine: bool = True, n: int = 1 + messages, agent: str = "APP", use_backup_engine: bool = True, n: int = 1, + configs = configs ) -> Tuple[list, float]: """ Get completions for the given messages. 
@@ -44,6 +45,10 @@ def get_completions( agent_type = "HOST_AGENT" elif agent.lower() in ["app", "appagent"]: agent_type = "APP_AGENT" + elif agent.lower() == "prefill": + agent_type = "PREFILL_AGENT" + elif agent.lower() == "filter": + agent_type = "FILTER_AGENT" elif agent.lower() == "backup": agent_type = "BACKUP_AGENT" else: diff --git a/ufo/llm/openai.py b/ufo/llm/openai.py index 9765cc33..75f985a6 100644 --- a/ufo/llm/openai.py +++ b/ufo/llm/openai.py @@ -29,7 +29,7 @@ def __init__(self, config: Dict[str, Any], agent_type: str) -> None: self.config = config self.api_type = self.config_llm["API_TYPE"].lower() self.max_retry = self.config["MAX_RETRY"] - self.prices = self.config["PRICES"] + self.prices = self.config.get("PRICES", {}) assert self.api_type in ["openai", "aoai", "azure_ad"], "Invalid API type" self.client: OpenAI = OpenAIService.get_openai_client( diff --git a/ufo/module/basic.py b/ufo/module/basic.py index e12b6058..47537ce4 100644 --- a/ufo/module/basic.py +++ b/ufo/module/basic.py @@ -657,7 +657,7 @@ def capture_last_snapshot(self) -> None: app_agent.Puppeteer.save_to_xml(xml_save_path) @staticmethod - def initialize_logger(log_path: str, log_filename: str) -> logging.Logger: + def initialize_logger(log_path: str, log_filename: str, mode='a', configs = configs) -> logging.Logger: """ Initialize logging. log_path: The path of the log file. @@ -672,7 +672,7 @@ def initialize_logger(log_path: str, log_filename: str) -> logging.Logger: logger.handlers = [] log_file_path = os.path.join(log_path, log_filename) - file_handler = logging.FileHandler(log_file_path, encoding="utf-8") + file_handler = logging.FileHandler(log_file_path, mode = mode, encoding="utf-8") formatter = logging.Formatter("%(message)s") file_handler.setFormatter(formatter) logger.addHandler(file_handler)