-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathagent.py
164 lines (133 loc) · 5.23 KB
/
agent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import cv2
import base64
from openai import OpenAI
from tts import AudioGenerator
from robot import Robot
from pydantic import BaseModel
# A single relative servo move proposed by the model.
# Agent.run forwards both fields to Robot.move_relative(servoID, change).
class Movement(BaseModel):
    # ID of the servo to move (the prompt in the __main__ block numbers them 1-6).
    servoID: int
    # Signed change to apply to that servo's position
    # (presumably in servo ticks, per the prompt text -- confirm against Robot).
    change: int
# Structured reply expected from the model; passed as `response_format`
# to the chat-completions parse call in Agent.analyze_with_prompt.
class Response(BaseModel):
    # Free-text analysis of the current camera frame.
    analysis: str
    # True once the model considers the task finished; Agent.run stops looping on it.
    done: bool
    # The next move to execute, or None when no movement is requested.
    movement: Movement | None
class Agent:
def __init__(self, use_bot: bool = True):
self.audio_generator = AudioGenerator()
self.client = OpenAI()
self.window_name = "Camera Feed"
self.use_bot = use_bot
if self.use_bot:
self.robot = Robot()
def encode_frame(self, frame):
"""Convert cv2 frame to base64 string"""
_, buffer = cv2.imencode('.jpg', frame)
return base64.b64encode(buffer).decode('utf-8')
def get_camera_frame(self):
"""Get a single frame from the camera"""
cap = cv2.VideoCapture(0)
ret, frame = cap.read()
cap.release()
if not ret:
raise RuntimeError("Failed to grab frame from camera")
print(f"Frame size: {frame.shape[1]}x{frame.shape[0]}")
return frame
def analyze_frame(self, frame):
"""Get GPT-4V analysis of frame"""
base64_frame = self.encode_frame(frame)
response = self.client.beta.chat.completions.create(
# model="gpt-4o-mini",
model="gpt-4o",
messages=[{
"role": "user",
"content": [{
"type": "text",
"text": "Describe the contents of the image in detail."
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_frame}",
"detail": "high"
}
}]
}],
max_tokens=300
)
return response.choices[0].message.content
def analyze_with_prompt(self, frame, command: str):
"""Send frame to GPT-4V with custom prompt"""
response = self.client.beta.chat.completions.parse(
model="gpt-4o",
messages=[{
"role": "user",
"content": [{
"type": "text",
"text": command
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{self.encode_frame(frame)}",
"detail": "high"
}
}]
}],
response_format=Response,
temperature=0.7,
max_tokens=300,
timeout=30
)
parsed_response = response.choices[0].message.parsed
print(parsed_response)
return parsed_response
def run(self, command: str):
done = False
while not done:
"""Get single frame analysis with custom prompt"""
frame = self.get_camera_frame()
response = self.analyze_with_prompt(frame, command)
print("\nGPT-4V Analysis:")
print(response)
# self.audio_generator.say(response.analysis)
if self.use_bot:
if response.movement:
try:
self.robot.move_relative(response.movement.servoID, response.movement.change)
except Exception as e:
print(f"Error moving robot: {e}")
done = response.done
return response
# Script entry point: run one agent session on the gripper-closing task.
if __name__ == "__main__":
    # Prompt sent verbatim with every camera frame: arm description, loop
    # protocol, and the current command.
    task_prompt = """This is a 6-DOF (degrees of freedom) robotic arm with a gripper end effector. It communicates via serial over COM ports and uses position tracking for precise control. Each servo uses a position system where 4000 ticks represents a full 360-degree rotation.
Gripper (ServoID: 1)
Opens and closes the end effector
Positive change = Opens more
Negative change = Closes
2. Wrist Rotator (ServoID: 2)
Rotates the gripper around its axis
Higher number = Rotates clockwise (when looking down at gripper)
Wrist Bend (ServoID: 3)
Controls the up/down bend at the wrist joint
Higher number = Bends upward
Elbow (ServoID: 4)
Controls the elbow joint bend
Higher number = Bends upward (raises forearm)
Shoulder (ServoID: 5)
Controls the shoulder joint bend
Higher number = Bends upward (raises upper arm)
Base (ServoID: 6)
Rotates the entire arm at its base
Higher number = Rotates clockwise (when looking down at base)
The agent operates in a continuous loop until the task is complete:
1. Takes a picture from the camera
2. Sends the image to GPT-4V along with the command and bot description
3. GPT-4V analyzes the image and suggests a small movement (e.g. +-50 ticks)
4. The robot executes that movement
5. Loop repeats from step 1 until GPT-4V sets done=True
This allows GPT-4V to make small, iterative adjustments while getting visual feedback after each movement.
CURRENT COMMAND:
Close the gripper. Use stepamounts of +-50 ticks. Set Done to True when gripper is closed.
"""
    # Drive the real robot until the model reports the task as done.
    Agent(use_bot=True).run(task_prompt)