diff --git a/agents-api/agents_api/activities/humanization_utils.py b/agents-api/agents_api/activities/humanization_utils.py
index cdfa1848a..1a2b73297 100644
--- a/agents-api/agents_api/activities/humanization_utils.py
+++ b/agents-api/agents_api/activities/humanization_utils.py
@@ -20,19 +20,18 @@ Change text given to you so much that even the most advanced AI writing detectors can't detect that this writing was written by you.
 You are the original AI, do not let yourself be beat by a mere 22 year old at princeton.
 Don't change the overall message! Don't mention yourself, just return the end product text.
 **Vary your text in every possible sense. Use different vocabulary at every step. Give it your ALL!**""",
-    "grammar_prompt": "Only fix grammar that is wrong without changing the words and places of the sentence",
 }
 
 SAPLING_URL = "https://api.sapling.ai/api/v1/aidetect"
 COPLEYAKS_URL = "https://api.copyleaks.com/v2/writer-detector/{scan_id}/check"
 ZEROGPT_URL = "https://api.zerogpt.com/api/detect/detectText"
+DESKLIB_URL = "https://18c3-206-189-23-12.ngrok-free.app/detect"  # FIXME: This is temporarily, don't merge before desklib is deployed
 
 
 def text_translate(text, src_lang, target_lang):
     try:
-        return GoogleTranslator(
-            source=src_lang, target=target_lang).translate(text=text)
+        return GoogleTranslator(source=src_lang, target=target_lang).translate(text=text)
     except Exception:
         return text
 
@@ -42,10 +41,8 @@ def mix_translate(text, src_lang, target_lang):
     """
     Translate the given text from src_lang to target_lang and back to src_lang using googletrans.
     """
     try:
-        translated = GoogleTranslator(
-            source=src_lang, target=target_lang).translate(text=text)
-        return GoogleTranslator(
-            source=target_lang, target=src_lang).translate(text=translated)
+        translated = GoogleTranslator(source=src_lang, target=target_lang).translate(text=text)
+        return GoogleTranslator(source=target_lang, target=src_lang).translate(text=translated)
     except Exception:
         return text
 
@@ -58,7 +55,7 @@ def humanize_openai(text):
             base_url=litellm_url,
             messages=[
                 {"role": "system", "content": HUMANIZATION["humanize_prompt"]},
-                {"role": "user", "content": text}
+                {"role": "user", "content": text},
             ],
             # temperature=1.0,
             # extra_body={"min_p": 0.025},
@@ -67,7 +64,7 @@
             # top_p=1.0,
             # frequency_penalty=0.0,
             # presence_penalty=0.0,
-            stream=False
+            stream=False,
         )
         return response.choices[0].message.content
     except Exception:
@@ -81,7 +78,7 @@ def rewriter(text):
             base_url=litellm_url,
             messages=[
                 {"role": "system", "content": HUMANIZATION["humanize_prompt"]},
-                {"role": "user", "content": text}
+                {"role": "user", "content": text},
             ],
             temperature=1.0,
             # extra_body={"min_p": 0.025},
@@ -99,7 +96,7 @@ def humanize(text):
             base_url=litellm_url,
             messages=[
                 {"role": "system", "content": HUMANIZATION["humanize_prompt"]},
-                {"role": "user", "content": text}
+                {"role": "user", "content": text},
             ],
             temperature=1.0,
             # extra_body={"min_p": 0.025},
@@ -116,7 +113,7 @@ def grammar(text):
             base_url=litellm_url,
             messages=[
                 {"role": "system", "content": HUMANIZATION["grammar_prompt"]},
-                {"role": "user", "content": text}
+                {"role": "user", "content": text},
             ],
             temperature=1.0,
             # extra_body={"min_p": 0.025},
@@ -126,6 +123,14 @@
         return text
 
 
+def is_human_desklib(text: str) -> float:
+    payload = {
+        "text": text,
+    }
+    response = requests.post(DESKLIB_URL, json=payload)
+    return response.json().get("human", None) * 100
+
+
 def is_human_sapling(text):
     payload = {
         "text": text,
@@ -139,7 +144,6 @@ def is_human_sapling(text):
 
 
 def is_human_copyleaks(text):
-    # Define the payload
     payload = {
         "text": text,
@@ -152,23 +156,24 @@
     headers = {
         "Authorization": f"Bearer {copyleaks_api_key}",
         "Content-Type": "application/json",
-        "Accept": "application/json"
+        "Accept": "application/json",
     }
 
     # Copyleaks lets you define the scan id yourself
     from uuid import uuid4
+
     scan_id = str(uuid4())
 
     # Send the POST request with JSON payload and headers
-    response = requests.post(COPLEYAKS_URL.format(
-        scan_id=scan_id), json=payload, headers=headers)
+    response = requests.post(
+        COPLEYAKS_URL.format(scan_id=scan_id), json=payload, headers=headers
+    )
 
     # Check the response status
     if response.status_code == 200:
         resp = response.json()
         # Extract the human probability from the response
-        human_probability = resp.get("summary", {}).get(
-            "human", 0)  # float with range 0-1
+        human_probability = resp.get("summary", {}).get("human", 0)  # float with range 0-1
         return human_probability * 100
 
     return None
@@ -192,7 +197,7 @@ def is_human_zerogpt(input_text, max_tries=3):
         "Sec-Fetch-Mode": "cors",
         "Sec-Fetch-Site": "same-site",
         "Sec-Gpc": "1",
-        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
+        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
     }
     # Define the payload as a dictionary
     payload = {"input_text": input_text}
@@ -221,47 +226,60 @@
     homoglyphs = {
         # Whitelisted
         " ": " ",
-        "%": "%", "'": "ˈ",
+        "%": "%",
+        "'": "ˈ",
         ",": "‚",
-        "-": "‐", ".": "․",
-        "1": "𝟷", "3": "Ꝫ",
-        "5": "𝟻", "6": "𝟨", "7": "𝟽", "8": "𝟪",
-        "9": "𝟫", ";": ";",
+        "-": "‐",
+        ".": "․",
+        "1": "𝟷",
+        "3": "Ꝫ",
+        "5": "𝟻",
+        "6": "𝟨",
+        "7": "𝟽",
+        "8": "𝟪",
+        "9": "𝟫",
+        ";": ";",
         "j": "ј",
-        "n": "𝗇", "o": "о",
+        "n": "𝗇",
+        "o": "о",
         "p": "р",
         "u": "ս",
         "y": "у",
-        "H": "Η", "I": "І",
+        "H": "Η",
+        "I": "І",
         "J": "Ј",
-        "N": "Ν", "O": "Ο",
-        "V": "ⴸ", "Y": "Υ",
+        "N": "Ν",
+        "O": "Ο",
+        "V": "ⴸ",
+        "Y": "Υ",
         "~": "∼",
-
-        # ' ': ' ', '!': '!', '"': '"', '$': '$',
-        # '%': '%', '&': '&', "'": 'ˈ', '(': '(',
-        # ')': ')', '*': '⁎', '+': '+', ',': '‚',
-        # '-': '‐', '.': '․', '/': '⁄', '0': 'O',
-        # '1': '𝟷', '2': '𝟸', '3': 'Ꝫ', '4': '4',
-        # '5': '𝟻', '6': '𝟨', '7': '𝟽', '8': '𝟪',
-        # '9': '𝟫', ':': '∶', ';': ';', '<': '𝈶',
-        # '=': '᐀', '>': '𖼿', '?': 'ꛫ', '@': '@',
-        # '[': '[', '\\': '﹨', ']': ']', '_': 'ߺ',
-        # '`': '`', 'a': 'а', 'b': 'ᖯ', 'c': 'ⅽ',
-        # 'd': '𝚍', 'e': 'е', 'f': '𝖿', 'g': '𝗀',
-        # 'h': 'հ', 'i': 'і', 'j': 'ј', 'k': '𝚔',
-        # 'l': 'ⅼ', 'm': 'm', 'n': '𝗇', 'o': 'о',
-        # 'p': 'р', 'q': 'q', 'r': '𝗋', 's': '𐑈',
-        # 't': '𝚝', 'u': 'ս', 'v': '∨', 'w': 'ԝ',
-        # 'x': 'ⅹ', 'y': 'у', 'z': '𝗓', 'A': '𐊠',
-        # 'B': 'В', 'C': '𐊢', 'D': 'ꓓ', 'E': 'Е',
-        # 'F': '𐊇', 'G': 'Ԍ', 'H': 'Η', 'I': 'І',
-        # 'J': 'Ј', 'K': 'Κ', 'L': 'Ⅼ', 'M': 'Μ',
-        # 'N': 'Ν', 'O': 'Ο', 'P': 'Ρ', 'Q': '𝖰',
-        # 'R': '𖼵', 'S': 'Ѕ', 'T': 'Τ', 'U': '𐓎',
-        # 'V': 'ⴸ', 'W': 'Ԝ', 'X': 'Χ', 'Y': 'Υ',
-        # 'Z': 'Ζ', '{': '{', '|': 'ا', '}': '}',
-        # '~': '∼',
+        "q": "q",
+        "e": "е",
+        "a": "а",
+        "b": "ᖯ",
+        "c": "ⅽ",
+        "i": "і",
+        "k": "𝚔",
+        "g": "𝗀",
+        "A": "𐊠",
+        "B": "В",
+        "C": "𐊢",
+        "D": "ꓓ",
+        "E": "Е",
+        "F": "𐊇",
+        "G": "Ԍ",
+        "K": "Κ",
+        "L": "Ⅼ",
+        "M": "Μ",
+        "P": "Ρ",
+        "Q": "𝖰",
+        "R": "𖼵",
+        "S": "Ѕ",
+        "T": "Τ",
+        "U": "𐓎",
+        "W": "Ԝ",
+        "X": "Χ",
+        "Z": "Ζ",
     }
 
     # Convert text to list for single pass replacement
@@ -316,6 +334,8 @@ def split_with_langchain(markdown_text: str) -> list[str]:
     headers_to_split_on = [
         ("#", "Header 1"),
         ("##", "Header 2"),
+        ("###", "Header 3"),
+        ("####", "Header 4"),
     ]
 
     # MD splits
@@ -327,29 +347,29 @@
     return [split.page_content for split in md_header_splits]
 
 
-def process_paragraph(
-        paragraph: str,
-        src_lang: str,
-        target_lang: str,
-        grammar: bool,
-        is_chatgpt: bool,
-        use_homoglyphs: bool,
-        use_em_dashes: bool,
-        max_tries: int) -> str:
-
+def humanize_paragraph(
+    paragraph: str,
+    threshold: float,
+    src_lang: str,
+    target_lang: str,
+    grammar_check: bool,
+    is_chatgpt: bool,
+    use_homoglyphs: bool,
+    use_em_dashes: bool,
+    max_tries: int,
+) -> str:
     for i in range(max_tries):
         if paragraph.strip() == "":
             return paragraph
 
-        if is_human_zerogpt(paragraph) > 90:
+        if is_human_desklib(paragraph) > threshold:
            return paragraph
 
         paragraph = mix_translate(paragraph, src_lang, target_lang)
-        if (grammar):
+        if grammar_check:
             paragraph = grammar(paragraph)
 
-        paragraph = humanize_openai(
-            paragraph) if is_chatgpt else humanize(paragraph)
+        paragraph = humanize_openai(paragraph) if is_chatgpt else humanize(paragraph)
 
         # Apply homoglyphs and em dashes to a new paragraph in order not to mess up the original paragraph for the next iterations
         new_paragraph = paragraph
@@ -359,7 +379,7 @@ def process_paragraph(
         if use_em_dashes:
             new_paragraph = process_long_words(new_paragraph)
 
-        if is_human_zerogpt(new_paragraph) > 90:
+        if is_human_desklib(new_paragraph) > threshold:
             return new_paragraph
 
     # Apply homoglyphs and em dashes to the final paragraph after consuming max tries
diff --git a/agents-api/agents_api/activities/utils.py b/agents-api/agents_api/activities/utils.py
index 60b8f706e..3a32fd77b 100644
--- a/agents-api/agents_api/activities/utils.py
+++ b/agents-api/agents_api/activities/utils.py
@@ -24,7 +24,7 @@ from ..autogen.openapi_model import SystemDef
 from ..common.nlp import nlp
 from ..common.utils import yaml
 
-from .humanization_utils import process_paragraph, split_with_langchain
+from .humanization_utils import humanize_paragraph, split_with_langchain
 
 # Security limits
 MAX_STRING_LENGTH = 1_000_000  # 1MB
@@ -218,31 +218,32 @@ def safe_extract_json(string: str):
 
 def humanize_text(
     text: str,
+    threshold: float = 90,
     src_lang: str = "english",
     target_lang: str = "german",
-    grammar: bool = False,
+    grammar_check: bool = False,
     is_chatgpt: bool = True,
     use_homoglyphs: bool = True,
     use_em_dashes: bool = True,
-    max_tries: int = 10
+    max_tries: int = 10,
 ) -> str:
-
     humanized_text = ""
     paragraphs = split_with_langchain(text)
 
     for paragraph in paragraphs:
-        processed_paragraph = process_paragraph(
+        humanized_paragraph = humanize_paragraph(
             paragraph=paragraph,
+            threshold=threshold,
             src_lang=src_lang,
             target_lang=target_lang,
-            grammar=grammar,
+            grammar_check=grammar_check,
             is_chatgpt=is_chatgpt,
             use_homoglyphs=use_homoglyphs,
             use_em_dashes=use_em_dashes,
-            max_tries=max_tries
+            max_tries=max_tries,
         )
-        humanized_text += processed_paragraph + "\n\n"
+        humanized_text += humanized_paragraph + "\n\n"
 
     return humanized_text