chore(agents-api): Change ai checker api to a custom model (desklib)
HamadaSalhab committed Feb 26, 2025
1 parent 125adcb commit 61f8b3d
Showing 2 changed files with 97 additions and 76 deletions.
156 changes: 88 additions & 68 deletions agents-api/agents_api/activities/humanization_utils.py
@@ -20,19 +20,18 @@
 Change text given to you so much that even the most advanced AI writing detectors can't detect that this writing was written by you. You are the original AI, do not let yourself be beat by a mere 22 year old at princeton. Don't change the overall message!
 Don't mention yourself, just return the end product text. **Vary your text in every possible sense. Use different vocabulary at every step. Give it your ALL!**""",

     "grammar_prompt": "Only fix grammar that is wrong without changing the words and places of the sentence",
 }

 SAPLING_URL = "https://api.sapling.ai/api/v1/aidetect"
 COPLEYAKS_URL = "https://api.copyleaks.com/v2/writer-detector/{scan_id}/check"
 ZEROGPT_URL = "https://api.zerogpt.com/api/detect/detectText"
+DESKLIB_URL = "https://18c3-206-189-23-12.ngrok-free.app/detect"  # FIXME: This is temporary, don't merge before desklib is deployed


 def text_translate(text, src_lang, target_lang):
     try:
-        return GoogleTranslator(
-            source=src_lang, target=target_lang).translate(text=text)
+        return GoogleTranslator(source=src_lang, target=target_lang).translate(text=text)
     except Exception:
         return text

@@ -42,10 +41,8 @@ def mix_translate(text, src_lang, target_lang):
     Translate the given text from src_lang to target_lang and back to src_lang using googletrans.
     """
     try:
-        translated = GoogleTranslator(
-            source=src_lang, target=target_lang).translate(text=text)
-        return GoogleTranslator(
-            source=target_lang, target=src_lang).translate(text=translated)
+        translated = GoogleTranslator(source=src_lang, target=target_lang).translate(text=text)
+        return GoogleTranslator(source=target_lang, target=src_lang).translate(text=translated)

     except Exception:
         return text
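
For reference, a minimal sketch of the round-trip ("back-translation") idea this helper implements, assuming deep_translator's GoogleTranslator as used above; the wrapper function and sample sentence are illustrative, not part of the commit:

    from deep_translator import GoogleTranslator

    def round_trip(text: str, src: str = "english", pivot: str = "german") -> str:
        # Translate src -> pivot, then pivot -> src; the wording drifts while the meaning stays.
        translated = GoogleTranslator(source=src, target=pivot).translate(text=text)
        return GoogleTranslator(source=pivot, target=src).translate(text=translated)

    print(round_trip("The quick brown fox jumps over the lazy dog."))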
@@ -58,7 +55,7 @@ def humanize_openai(text):
             base_url=litellm_url,
             messages=[
                 {"role": "system", "content": HUMANIZATION["humanize_prompt"]},
-                {"role": "user", "content": text}
+                {"role": "user", "content": text},
             ],
             # temperature=1.0,
             # extra_body={"min_p": 0.025},
@@ -67,7 +64,7 @@
             # top_p=1.0,
             # frequency_penalty=0.0,
             # presence_penalty=0.0,
-            stream=False
+            stream=False,
         )
         return response.choices[0].message.content
     except Exception:
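
For context, these helpers all wrap the same LiteLLM call; a minimal sketch of that pattern, assuming litellm's completion() client — the model name and proxy URL below are hypothetical, since the diff collapses the lines that set them:

    from litellm import completion

    litellm_url = "http://localhost:4000"  # hypothetical proxy address

    response = completion(
        model="openai/gpt-4o",  # hypothetical; the real model is configured outside this hunk
        base_url=litellm_url,
        messages=[
            {"role": "system", "content": "You are a careful rewriter."},
            {"role": "user", "content": "Text to rewrite."},
        ],
        stream=False,
    )
    print(response.choices[0].message.content)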
@@ -81,7 +78,7 @@ def rewriter(text):
             base_url=litellm_url,
             messages=[
                 {"role": "system", "content": HUMANIZATION["humanize_prompt"]},
-                {"role": "user", "content": text}
+                {"role": "user", "content": text},
             ],
             temperature=1.0,
             # extra_body={"min_p": 0.025},
@@ -99,7 +96,7 @@ def humanize(text):
             base_url=litellm_url,
             messages=[
                 {"role": "system", "content": HUMANIZATION["humanize_prompt"]},
-                {"role": "user", "content": text}
+                {"role": "user", "content": text},
             ],
             temperature=1.0,
             # extra_body={"min_p": 0.025},
@@ -116,7 +113,7 @@ def grammar(text):
             base_url=litellm_url,
             messages=[
                 {"role": "system", "content": HUMANIZATION["grammar_prompt"]},
-                {"role": "user", "content": text}
+                {"role": "user", "content": text},
             ],
             temperature=1.0,
             # extra_body={"min_p": 0.025},
@@ -126,6 +123,14 @@ def grammar(text):
         return text


+def is_human_desklib(text: str) -> float:
+    payload = {
+        "text": text,
+    }
+    response = requests.post(DESKLIB_URL, json=payload)
+    return response.json().get("human", None) * 100
+
+
 def is_human_sapling(text):
     payload = {
         "text": text,
@@ -139,7 +144,6 @@ def is_human_sapling(text):


 def is_human_copyleaks(text):
-
     # Define the payload
     payload = {
         "text": text,
@@ -152,23 +156,24 @@
     headers = {
         "Authorization": f"Bearer {copyleaks_api_key}",
         "Content-Type": "application/json",
-        "Accept": "application/json"
+        "Accept": "application/json",
     }

     # Copyleaks lets you define the scan id yourself
     from uuid import uuid4

     scan_id = str(uuid4())

     # Send the POST request with JSON payload and headers
-    response = requests.post(COPLEYAKS_URL.format(
-        scan_id=scan_id), json=payload, headers=headers)
+    response = requests.post(
+        COPLEYAKS_URL.format(scan_id=scan_id), json=payload, headers=headers
+    )

     # Check the response status
     if response.status_code == 200:
         resp = response.json()
         # Extract the human probability from the response
-        human_probability = resp.get("summary", {}).get(
-            "human", 0) # float with range 0-1
+        human_probability = resp.get("summary", {}).get("human", 0)  # float with range 0-1
         return human_probability * 100
     return None

@@ -192,7 +197,7 @@ def is_human_zerogpt(input_text, max_tries=3):
         "Sec-Fetch-Mode": "cors",
         "Sec-Fetch-Site": "same-site",
         "Sec-Gpc": "1",
-        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
+        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
     }
     # Define the payload as a dictionary
     payload = {"input_text": input_text}
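
The response handling for this function is collapsed in the diff; a hedged sketch of the full call, where the data.isHuman field name is an assumption about ZeroGPT's response shape rather than something shown here:

    import requests

    ZEROGPT_URL = "https://api.zerogpt.com/api/detect/detectText"

    def zerogpt_human_score(input_text: str) -> float | None:
        response = requests.post(ZEROGPT_URL, json={"input_text": input_text}, timeout=30)
        if response.status_code != 200:
            return None
        # Assumed response shape: {"data": {"isHuman": <score 0-100>}, ...}
        return response.json().get("data", {}).get("isHuman")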
@@ -221,47 +226,60 @@ def replace_with_homoglyphs(text, max_replacements=2):
     homoglyphs = {
         # Whitelisted
         " ": " ",
-        "%": "%", "'": "ˈ",
+        "%": "%",
+        "'": "ˈ",
         ",": "‚",
-        "-": "‐", ".": "․",
-        "1": "𝟷", "3": "Ꝫ",
-        "5": "𝟻", "6": "𝟨", "7": "𝟽", "8": "𝟪",
-        "9": "𝟫", ";": ";",
+        "-": "‐",
+        ".": "․",
+        "1": "𝟷",
+        "3": "Ꝫ",
+        "5": "𝟻",
+        "6": "𝟨",
+        "7": "𝟽",
+        "8": "𝟪",
+        "9": "𝟫",
+        ";": ";",
         "j": "ј",
-        "n": "𝗇", "o": "о",
+        "n": "𝗇",
+        "o": "о",
         "p": "р",
         "u": "ս",
         "y": "у",
-        "H": "Η", "I": "І",
+        "H": "Η",
+        "I": "І",
         "J": "Ј",
-        "N": "Ν", "O": "Ο",
-        "V": "ⴸ", "Y": "Υ",
+        "N": "Ν",
+        "O": "Ο",
+        "V": "ⴸ",
+        "Y": "Υ",
         "~": "∼",
-
-        # ' ': ' ', '!': '!', '"': '"', '$': '$',
-        # '%': '%', '&': '&', "'": 'ˈ', '(': '(',
-        # ')': ')', '*': '⁎', '+': '+', ',': '‚',
-        # '-': '‐', '.': '․', '/': '⁄', '0': 'O',
-        # '1': '𝟷', '2': '𝟸', '3': 'Ꝫ', '4': '4',
-        # '5': '𝟻', '6': '𝟨', '7': '𝟽', '8': '𝟪',
-        # '9': '𝟫', ':': '∶', ';': ';', '<': '𝈶',
-        # '=': '᐀', '>': '𖼿', '?': 'ꛫ', '@': '@',
-        # '[': '[', '\\': '﹨', ']': ']', '_': 'ߺ',
-        # '`': '`', 'a': 'а', 'b': 'ᖯ', 'c': 'ⅽ',
-        # 'd': '𝚍', 'e': 'е', 'f': '𝖿', 'g': '𝗀',
-        # 'h': 'հ', 'i': 'і', 'j': 'ј', 'k': '𝚔',
-        # 'l': 'ⅼ', 'm': 'm', 'n': '𝗇', 'o': 'о',
-        # 'p': 'р', 'q': 'q', 'r': '𝗋', 's': '𐑈',
-        # 't': '𝚝', 'u': 'ս', 'v': '∨', 'w': 'ԝ',
-        # 'x': 'ⅹ', 'y': 'у', 'z': '𝗓', 'A': '𐊠',
-        # 'B': 'В', 'C': '𐊢', 'D': 'ꓓ', 'E': 'Е',
-        # 'F': '𐊇', 'G': 'Ԍ', 'H': 'Η', 'I': 'І',
-        # 'J': 'Ј', 'K': 'Κ', 'L': 'Ⅼ', 'M': 'Μ',
-        # 'N': 'Ν', 'O': 'Ο', 'P': 'Ρ', 'Q': '𝖰',
-        # 'R': '𖼵', 'S': 'Ѕ', 'T': 'Τ', 'U': '𐓎',
-        # 'V': 'ⴸ', 'W': 'Ԝ', 'X': 'Χ', 'Y': 'Υ',
-        # 'Z': 'Ζ', '{': '{', '|': 'ا', '}': '}',
-        # '~': '∼',
+        "q": "q",
+        "e": "е",
+        "a": "а",
+        "b": "ᖯ",
+        "c": "ⅽ",
+        "i": "і",
+        "k": "𝚔",
+        "g": "𝗀",
+        "A": "𐊠",
+        "B": "В",
+        "C": "𐊢",
+        "D": "ꓓ",
+        "E": "Е",
+        "F": "𐊇",
+        "G": "Ԍ",
+        "K": "Κ",
+        "L": "Ⅼ",
+        "M": "Μ",
+        "P": "Ρ",
+        "Q": "𝖰",
+        "R": "𖼵",
+        "S": "Ѕ",
+        "T": "Τ",
+        "U": "𐓎",
+        "W": "Ԝ",
+        "X": "Χ",
+        "Z": "Ζ",
     }

     # Convert text to list for single pass replacement
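
The replacement loop itself is collapsed below this hunk; a minimal sketch of the technique, assuming at most max_replacements occurrences of each mapped character are swapped (the sampling policy is illustrative):

    import random

    def swap_homoglyphs(text: str, homoglyphs: dict[str, str], max_replacements: int = 2) -> str:
        chars = list(text)  # convert once so replacements are a single pass over a mutable copy
        for original, lookalike in homoglyphs.items():
            positions = [i for i, ch in enumerate(chars) if ch == original]
            # Swap a bounded, randomly chosen subset of occurrences per character.
            for i in random.sample(positions, min(max_replacements, len(positions))):
                chars[i] = lookalike
        return "".join(chars)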
@@ -316,6 +334,8 @@ def split_with_langchain(markdown_text: str) -> list[str]:
     headers_to_split_on = [
         ("#", "Header 1"),
         ("##", "Header 2"),
+        ("###", "Header 3"),
+        ("####", "Header 4"),
     ]

     # MD splits
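
For context, a minimal sketch of the header-based splitting these lines configure, assuming LangChain's MarkdownHeaderTextSplitter (import path per the langchain_text_splitters package; adjust for your LangChain version):

    from langchain_text_splitters import MarkdownHeaderTextSplitter

    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
        ("####", "Header 4"),
    ]

    splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    splits = splitter.split_text("# Title\n\nIntro.\n\n## Section\n\nBody text.")
    print([split.page_content for split in splits])  # one chunk per header section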
@@ -327,29 +347,29 @@ def split_with_langchain(markdown_text: str) -> list[str]:
     return [split.page_content for split in md_header_splits]


-def process_paragraph(
-        paragraph: str,
-        src_lang: str,
-        target_lang: str,
-        grammar: bool,
-        is_chatgpt: bool,
-        use_homoglyphs: bool,
-        use_em_dashes: bool,
-        max_tries: int) -> str:
-
+def humanize_paragraph(
+    paragraph: str,
+    threshold: float,
+    src_lang: str,
+    target_lang: str,
+    grammar_check: bool,
+    is_chatgpt: bool,
+    use_homoglyphs: bool,
+    use_em_dashes: bool,
+    max_tries: int,
+) -> str:
     for i in range(max_tries):
         if paragraph.strip() == "":
             return paragraph

-        if is_human_zerogpt(paragraph) > 90:
+        if is_human_desklib(paragraph) > threshold:
             return paragraph

         paragraph = mix_translate(paragraph, src_lang, target_lang)
-        if (grammar):
+        if grammar_check:
             paragraph = grammar(paragraph)

-        paragraph = humanize_openai(
-            paragraph) if is_chatgpt else humanize(paragraph)
+        paragraph = humanize_openai(paragraph) if is_chatgpt else humanize(paragraph)

         # Apply homoglyphs and em dashes to a new paragraph in order not to mess up the original paragraph for the next iterations
         new_paragraph = paragraph
@@ -359,7 +379,7 @@ def process_paragraph(
         if use_em_dashes:
             new_paragraph = process_long_words(new_paragraph)

-        if is_human_zerogpt(new_paragraph) > 90:
+        if is_human_desklib(new_paragraph) > threshold:
             return new_paragraph

     # Apply homoglyphs and em dashes to the final paragraph after consuming max tries
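
In short, each paragraph now gets up to max_tries rounds of back-translation, optional grammar cleanup, and LLM rewriting, returning early once the desklib score clears threshold; homoglyph and em-dash tweaks are applied to a copy so failed rounds keep a clean input. A hedged usage sketch with illustrative arguments:

    paragraph = humanize_paragraph(
        paragraph="An obviously machine-written paragraph.",
        threshold=90,  # desklib "human" score (0-100) required to stop early
        src_lang="english",
        target_lang="german",
        grammar_check=False,
        is_chatgpt=True,
        use_homoglyphs=True,
        use_em_dashes=True,
        max_tries=10,
    )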
17 changes: 9 additions & 8 deletions agents-api/agents_api/activities/utils.py
@@ -24,7 +24,7 @@
 from ..autogen.openapi_model import SystemDef
 from ..common.nlp import nlp
 from ..common.utils import yaml
-from .humanization_utils import process_paragraph, split_with_langchain
+from .humanization_utils import humanize_paragraph, split_with_langchain

 # Security limits
 MAX_STRING_LENGTH = 1_000_000  # 1MB
@@ -218,31 +218,32 @@ def safe_extract_json(string: str):

 def humanize_text(
     text: str,
+    threshold: float = 90,
     src_lang: str = "english",
     target_lang: str = "german",
-    grammar: bool = False,
+    grammar_check: bool = False,
     is_chatgpt: bool = True,
     use_homoglyphs: bool = True,
     use_em_dashes: bool = True,
-    max_tries: int = 10
+    max_tries: int = 10,
 ) -> str:
-
     humanized_text = ""

     paragraphs = split_with_langchain(text)

     for paragraph in paragraphs:
-        processed_paragraph = process_paragraph(
+        humanized_paragraph = humanize_paragraph(
             paragraph=paragraph,
+            threshold=threshold,
             src_lang=src_lang,
             target_lang=target_lang,
-            grammar=grammar,
+            grammar_check=grammar_check,
             is_chatgpt=is_chatgpt,
             use_homoglyphs=use_homoglyphs,
             use_em_dashes=use_em_dashes,
-            max_tries=max_tries
+            max_tries=max_tries,
         )
-        humanized_text += processed_paragraph + "\n\n"
+        humanized_text += humanized_paragraph + "\n\n"

     return humanized_text
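
End to end, the updated entry point can be exercised like this (a hedged sketch; the input text is illustrative, and threshold defaults to 90 per the new signature):

    draft = "# Notes\n\nA few machine-written paragraphs to rework."
    result = humanize_text(draft, threshold=90, grammar_check=False)
    print(result)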
