-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathoptimizer.py
225 lines (181 loc) · 8.72 KB
/
optimizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
import re
import json
from typing import List, Dict, Any, Optional
import spacy
import nltk
from nltk.tokenize import sent_tokenize
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
# Download necessary NLTK data
# The "punkt" tokenizer models are fetched only when they are not already
# present in the local NLTK data path.
# NOTE(review): recent NLTK releases may look up "punkt_tab" instead of
# "punkt" for sent_tokenize — confirm against the pinned NLTK version.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Load spaCy model
# spacy.load raises OSError when the model package is not installed; only that
# case triggers a download. Any other failure (including KeyboardInterrupt)
# propagates instead of being swallowed by a bare ``except``.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import spacy.cli  # explicit import: do not rely on spacy exposing the submodule implicitly
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Load prompt templates and best practices
# Paths are relative to the current working directory. The encoding is pinned
# so parsing does not depend on the platform default.
with open("data/prompt_templates.json", "r", encoding="utf-8") as f:
    PROMPT_TEMPLATES = json.load(f)
with open("data/model_best_practices.json", "r", encoding="utf-8") as f:
    MODEL_BEST_PRACTICES = json.load(f)

# Initialize models
# Both models are created once at import time and shared by the functions below.
generator = pipeline("text-generation", model="gpt2")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
def generate_prompt(goal: str, target_model: str, context: Optional[str] = None,
                    style: str = "detailed", formats: Optional[List[str]] = None) -> str:
    """
    Generate a prompt based on user goal and target AI model

    Args:
        goal: The user's goal for the prompt
        target_model: The target AI model (e.g., "chatgpt", "claude", "gemini").
            Matched case-insensitively; unknown models use the "default" entries.
        context: Additional context for the prompt (omitted when empty or None)
        style: Style of the prompt (concise, detailed, step-by-step). Any other
            value adds no style instruction.
        formats: List of prompt formats to include. Recognised entries are
            "persona", "constraints" and "examples"; defaults to ["standard"].

    Returns:
        A generated prompt optimized for the target model

    Raises:
        KeyError: If the selected template or practices entry lacks a key that
            the requested style/formats need.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if formats is None:
        formats = ["standard"]

    target_model = target_model.lower().strip()
    template = PROMPT_TEMPLATES.get(target_model, PROMPT_TEMPLATES["default"])
    practices = MODEL_BEST_PRACTICES.get(target_model, MODEL_BEST_PRACTICES["default"])

    components = []
    if "persona" in formats:
        components.append(f"As an expert {extract_domain(goal)},")
    components.append(goal)
    if context:
        components.append(f"Context: {context}")

    # Map each supported style to the practices entry that describes it.
    style_keys = {
        "detailed": "detailed_format",
        "step-by-step": "step_instructions",
        "concise": "concise_format",
    }
    style_key = style_keys.get(style)
    if style_key is not None:
        components.append(practices[style_key])

    prompt = f"{template['prefix']} {' '.join(components)}"
    if "constraints" in formats:
        prompt += f"\n\n{practices['constraints']}"
    if "examples" in formats:
        prompt += f"\n\n{practices['example_format']}"
    prompt += template["suffix"]

    # Normalise whitespace; this also collapses the blank lines added above
    # into single spaces, so the result is always a single line.
    prompt = re.sub(r'\s+', ' ', prompt).strip()
    prompt = prompt.replace(" .", ".").replace(" ,", ",")
    return prompt
def _optimize_minimal(prompt: str) -> str:
    """Handle the "minimal" level by delegating to clean_prompt."""
    tidied = clean_prompt(prompt)
    return tidied
def _optimize_balanced(prompt: str, practices: Dict[str, str]) -> str:
    """Handle the "balanced" level by delegating to enhance_prompt_structure."""
    restructured = enhance_prompt_structure(prompt, practices)
    return restructured
def _optimize_maximum(prompt: str, target_model: str, practices: Dict[str, str]) -> str:
    """Handle the "maximum" level by delegating to rewrite_prompt."""
    rewritten = rewrite_prompt(prompt, target_model, practices)
    return rewritten
def optimize_prompt(prompt: str, target_model: str, optimization_level: str = "balanced") -> str:
    """
    Optimize an existing prompt for better results

    Args:
        prompt: The existing prompt to optimize
        target_model: The target AI model. Matched case-insensitively; unknown
            models use the "default" best-practices entry.
        optimization_level: How aggressively to optimize (minimal, balanced, maximum).
            Any value other than "minimal" or "balanced" is treated as "maximum".

    Returns:
        An optimized version of the prompt
    """
    # The spaCy parse and sentence split that used to run here were never
    # read, so they are dropped; the helpers do their own analysis as needed.
    target_model = target_model.lower().strip()
    practices = MODEL_BEST_PRACTICES.get(target_model, MODEL_BEST_PRACTICES["default"])

    if optimization_level == "minimal":
        return _optimize_minimal(prompt)
    if optimization_level == "balanced":
        return _optimize_balanced(prompt, practices)
    # maximum (also the fallback for unrecognised levels)
    return _optimize_maximum(prompt, target_model, practices)
def clean_prompt(prompt: str) -> str:
    """
    Clean and format a prompt with minimal changes

    Args:
        prompt: The raw prompt text

    Returns:
        The prompt with whitespace runs collapsed to single spaces, the space
        before punctuation removed, and a trailing period appended when the
        text does not already end in ".", "!" or "?". Blank input yields an
        empty string.
    """
    # Remove excessive whitespace
    cleaned = re.sub(r'\s+', ' ', prompt).strip()
    # Nothing to punctuate: do not turn an empty prompt into a lone "."
    if not cleaned:
        return cleaned
    # Fix basic punctuation issues (drop the space before , . ! ? ; :)
    cleaned = re.sub(r'\s([,.!?;:])', r'\1', cleaned)
    # Ensure the prompt ends with sentence-final punctuation
    if not re.search(r'[.!?]$', cleaned):
        cleaned += "."
    return cleaned
def enhance_prompt_structure(prompt: str, practices: Dict[str, str]) -> str:
    """
    Enhance prompt structure while preserving core content

    The first sentence is treated as the introduction, the last sentence (when
    there is more than one) as the conclusion, and everything in between as the
    body.

    Args:
        prompt: The prompt to restructure
        practices: Best-practice strings for the target model; the
            "output_format" and "optimization_hint" entries are read.

    Returns:
        The restructured prompt followed by a blank line and the optimization hint

    Raises:
        KeyError: If ``practices`` lacks a required entry.
    """
    sentences = sent_tokenize(prompt)

    # Identify parts of the prompt. The body is strictly the middle sentences:
    # for a two-sentence prompt the second sentence is the conclusion only and
    # must not also be emitted as body text.
    intro_part = sentences[0] if sentences else ""
    body_parts = sentences[1:-1]
    conclusion_part = sentences[-1] if len(sentences) > 1 else ""

    # Enhance introduction with clear role/task. Word boundaries keep phrases
    # such as "was a" or "has an" from being mistaken for a role statement.
    if re.search(r'\b(you are|act as|as an?|assume the role)\b', intro_part.lower()):
        enhanced_intro = intro_part
    else:
        domain = extract_domain(prompt)
        enhanced_intro = f"As a specialized {domain} expert, {intro_part}"

    # Enhance body with structure markers
    enhanced_body = []
    for part in body_parts:
        subparts = part.split(", ")
        if len(part) > 100 and len(subparts) > 2:
            # Long sentence with several comma-separated clauses: convert to bullet points
            enhanced_body.append("Key points:")
            enhanced_body.extend(f"- {subpart.strip()}" for subpart in subparts)
        else:
            enhanced_body.append(part)

    # Enhance conclusion with clear output expectations
    if re.search(r'(please provide|i need|output format|format your response)', conclusion_part.lower()):
        enhanced_conclusion = conclusion_part
    else:
        # strip() drops the leading space left when there is no conclusion sentence
        enhanced_conclusion = f"{conclusion_part} {practices['output_format']}".strip()

    # Combine enhanced parts, skipping empty ones so no doubled spaces appear
    pieces = [enhanced_intro, *enhanced_body, enhanced_conclusion]
    result = " ".join(piece for piece in pieces if piece)

    # Add optimization hint based on target model
    result += f"\n\n{practices['optimization_hint']}"
    return result
def rewrite_prompt(prompt: str, target_model: str, practices: Dict[str, str]) -> str:
    """
    Completely rewrite a prompt for optimal results

    Args:
        prompt: The prompt to rewrite
        target_model: Name of the target AI model, interpolated into the
            rewriting instruction
        practices: Best-practice strings for the target model; the
            "detailed_format" entry is read.

    Returns:
        The text generated after the "Optimized version:" marker (or the
        original prompt if nothing was generated), with the detailed-format
        guidance appended when the text shows no step/bullet markers.

    Raises:
        KeyError: If ``practices`` lacks "detailed_format".
    """
    # The spaCy analysis and embedding-based template lookup that used to run
    # here produced values that were never used, so they are omitted.

    # Generate new prompt using transformer pipeline
    # This is just a placeholder - in a real app you'd use a more sophisticated approach
    generation_prompt = f"Rewrite this prompt for {target_model}: {prompt}\n\nOptimized version:"
    # max_new_tokens bounds only the continuation; max_length would also count
    # the instruction and prompt tokens and can leave no room for output.
    outputs = generator(generation_prompt, max_new_tokens=150, num_return_sequences=1)
    generated = outputs[0]['generated_text']

    # Extract the generated part (the pipeline output echoes the input text)
    rewritten = generated.split("Optimized version:")[-1].strip()
    if not rewritten:
        # Nothing usable was generated; keep the caller's prompt rather than
        # returning only formatting boilerplate.
        rewritten = prompt.strip()

    # Apply model-specific formatting
    model_format = practices["detailed_format"]
    lowered = rewritten.lower()
    if not any(marker in lowered for marker in ["step", "bullet", "1.", "i.", "•"]):
        rewritten += f"\n\n{model_format}"
    return rewritten
def extract_domain(text: str) -> str:
    """
    Extract likely domain/field from text

    Explicit domain names are checked first, in list order; if none is
    mentioned, keyword groups select a broader default.

    Args:
        text: The text to classify

    Returns:
        The first matching domain name, one of the keyword-based defaults
        ("software engineering", "content creation", "analytical research"),
        or "AI assistant" when nothing matches.
    """
    # Define common domains
    domains = ["AI", "machine learning", "data science", "marketing", "business", "writing",
               "programming", "development", "design", "research", "teaching", "academic",
               "engineering", "healthcare", "technology", "science", "communication"]

    lowered = text.lower()

    # Check for explicit domain mentions
    for domain in domains:
        needle = domain.lower()
        if domain.isupper():
            # Acronyms must match as whole words: a plain substring test for
            # "ai" fires inside "explain", "email", "maintain", ...
            if re.search(rf"\b{re.escape(needle)}\b", lowered):
                return domain
        elif needle in lowered:
            return domain

    # Default domains based on common words
    keyword_defaults = [
        (["code", "programming", "algorithm", "software", "developer"], "software engineering"),
        (["write", "essay", "blog", "article", "content"], "content creation"),
        (["analyze", "research", "study", "investigate"], "analytical research"),
    ]
    for keywords, default_domain in keyword_defaults:
        if any(word in lowered for word in keywords):
            return default_domain
    return "AI assistant"