instruction_augment.py
#!/usr/bin/env python
import json

import pddl
from pddl.core import Problem
from pddl.logic import Predicate, constants

from LMs.gpt import GPTAssistant
from utils.llm_utils import get_env_variable


def augment_instruction(instruction, image, pcd, **kwargs):
    """
    Construct a PDDL problem description from an instruction, an image, and a
    point cloud: e.g., the object name from the instruction, the object
    embedding from the image (via CLIP), and the object position from the
    point cloud.
    """
    room_domain = pddl.parse_domain("domains/room.pddl")

    # first get the object name and related information from the instruction
    azure_openai_key = get_env_variable("AZURE_OPENAI_KEY")
    azure_endpoint = get_env_variable("AZURE_OPENAI_ENDPOINT")
    gpt = GPTAssistant(
        azure_openai_key=azure_openai_key,
        endpoint=azure_endpoint,
        model_name="gpt-4-32k",
    )
    with open("prompts/assistant/system.txt", "r") as f:
        system_msg = f.read()
    gpt.create_prompt(system_msg)
    with open("prompts/assistant/query.txt", "r") as f:
        query_msg = f.read()
    query_msg += "\n" + instruction
    response = gpt.generate(content=query_msg)
    # strip any surrounding text so only the JSON object remains, then parse it
    response = response[response.find("{"):]
    response = response[: response.rfind("}") + 1]
    response = json.loads(response)
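    # Expected shape of the parsed response (the keys match the lookups below;
    # the example values are illustrative only, since the actual schema is set
    # by prompts/assistant/query.txt, which is not shown here):
    # {"object_name": "cup", "related_object_name": "table", "relation": "on"}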
    # get the extracted objects and their spatial relation
    obj1 = response["object_name"]
    obj2 = response["related_object_name"]
    relation = response["relation"]
    # then get the object embedding from the image
    # then get the position from the point cloud
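    # A minimal sketch of the two steps above (not implemented in the original
    # file). It assumes OpenAI's `clip` package for the embedding, that `image`
    # is a PIL image already cropped to the detected object, and that `pcd` is
    # an Open3D point cloud already segmented to that object. The results are
    # left for downstream grounding and are not used in the PDDL problem below.
    import clip  # assumed dependency
    import torch
    import numpy as np

    clip_model, preprocess = clip.load("ViT-B/32", device="cpu")
    with torch.no_grad():
        # CLIP embedding of the object crop, shape (1, 512) for ViT-B/32
        obj_embedding = clip_model.encode_image(preprocess(image).unsqueeze(0))
    # crude position estimate: centroid of the object's points
    obj_position = np.asarray(pcd.points).mean(axis=0)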
    # finally construct the PDDL problem
    # PDDL names cannot contain spaces, so normalize the extracted names
    obj1_const, obj2_const = constants(
        f"{obj1.replace(' ', '_')} {obj2.replace(' ', '_')}"
    )
    goal = Predicate(relation.replace(" ", "_"), obj1_const, obj2_const)
    problem = Problem(
        "augmented-instruction",  # problem names must be valid PDDL names
        domain=room_domain,
        requirements=room_domain.requirements,
        objects=[obj1_const, obj2_const],
        init=[],  # the detected scene state could be encoded here
        goal=goal,
    )
    return problem
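

if __name__ == "__main__":
    # Hypothetical smoke test; the file paths and the instruction are
    # illustrative only and do not come from this repository.
    from PIL import Image
    import open3d as o3d

    image = Image.open("data/scene.png")
    pcd = o3d.io.read_point_cloud("data/scene.pcd")
    problem = augment_instruction("put the cup on the table", image, pcd)
    print(problem)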