Merge pull request #39 from NREL/pp/dev
Misc updates and bug fixes
ppinchuk authored Nov 21, 2024
2 parents 2e4f268 + 09ca946 commit 1127b0e
Showing 7 changed files with 38 additions and 11 deletions.
7 changes: 7 additions & 0 deletions .gitignore
@@ -124,3 +124,10 @@ examples/research_hub/txt/
 examples/research_hub/meta.csv
 *ignore*.py

+# pixi environments
+.pixi
+*.egg-info
+pixi*
+
+# Scratch
+*scratch*/
26 changes: 23 additions & 3 deletions elm/base.py
@@ -46,10 +46,18 @@ class ApiBase(ABC):

     TOKENIZER_ALIASES = {'gpt-35-turbo': 'gpt-3.5-turbo',
                          'gpt-4-32k': 'gpt-4-32k-0314',
-                         'llmev-gpt-4-32k': 'gpt-4-32k-0314'
+                         'llmev-gpt-4-32k': 'gpt-4-32k-0314',
+                         'wetosa-gpt-4': 'gpt-4',
+                         'wetosa-gpt-4-standard': 'gpt-4',
+                         'wetosa-gpt-4o': 'gpt-4o',
                          }
     """Optional mappings for unusual Azure names to tiktoken/openai names."""

+    TOKENIZER_PATTERNS = ('gpt-4o', 'gpt-4-32k', 'gpt-4')
+    """Order-prioritized list of model sub-strings to look for in model name
+    to send to tokenizer. As an alternative to alias lookup, this will use the
+    tokenizer pattern if found in the model string"""
+
     def __init__(self, model=None):
         """
         Parameters
@@ -345,7 +353,7 @@ def get_embedding(cls, text):
         return embedding

     @classmethod
-    def count_tokens(cls, text, model):
+    def count_tokens(cls, text, model, fallback_model='gpt-4'):
         """Return the number of tokens in a string.

         Parameters
@@ -354,14 +362,26 @@ def count_tokens(cls, text, model):
             Text string to get number of tokens for
         model : str
             specification of OpenAI model to use (e.g., "gpt-3.5-turbo")
+        fallback_model : str, default='gpt-4'
+            Model to be used for tokenizer if input model can't be found
+            in :obj:`TOKENIZER_ALIASES` and doesn't have any easily
+            noticeable patterns.
+
         Returns
         -------
         n : int
             Number of tokens in text
         """

-        token_model = cls.TOKENIZER_ALIASES.get(model, model)
+        if model in cls.TOKENIZER_ALIASES:
+            token_model = cls.TOKENIZER_ALIASES[model]
+        else:
+            token_model = fallback_model
+            for pattern in cls.TOKENIZER_PATTERNS:
+                if pattern in model:
+                    token_model = pattern
+                    break

         encoding = tiktoken.encoding_for_model(token_model)

         return len(encoding.encode(text))
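Note on the new lookup order in count_tokens: exact alias match first, then the first matching substring pattern, then the fallback model. Below is a minimal standalone sketch of that resolution logic, assuming only that tiktoken is installed; the attribute values are copied from this diff, and resolve_tokenizer is an illustrative helper, not a function in elm.base.

    import tiktoken

    # Attribute values mirror the diff above; this is a sketch, not library code.
    TOKENIZER_ALIASES = {'gpt-35-turbo': 'gpt-3.5-turbo',
                         'gpt-4-32k': 'gpt-4-32k-0314',
                         'llmev-gpt-4-32k': 'gpt-4-32k-0314',
                         'wetosa-gpt-4': 'gpt-4',
                         'wetosa-gpt-4-standard': 'gpt-4',
                         'wetosa-gpt-4o': 'gpt-4o'}
    TOKENIZER_PATTERNS = ('gpt-4o', 'gpt-4-32k', 'gpt-4')

    def resolve_tokenizer(model, fallback_model='gpt-4'):
        """Exact alias first, then ordered substring match, then the fallback."""
        if model in TOKENIZER_ALIASES:
            return TOKENIZER_ALIASES[model]
        for pattern in TOKENIZER_PATTERNS:
            if pattern in model:
                return pattern
        return fallback_model

    token_model = resolve_tokenizer("wetosa-gpt-4o")            # 'gpt-4o' via alias
    token_model = resolve_tokenizer("my-custom-gpt-4o-deploy")  # 'gpt-4o' via pattern
    encoding = tiktoken.encoding_for_model(token_model)
    print(len(encoding.encode("How many tokens is this?")))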
3 changes: 1 addition & 2 deletions elm/ords/download.py
@@ -103,7 +103,6 @@ async def _docs_from_google_search(
     return await google_results_as_docs(
         queries,
         num_urls=num_urls,
-        text_splitter=text_splitter,
         browser_semaphore=browser_semaphore,
         task_name=location.full_name,
         **file_loader_kwargs,
@@ -135,7 +134,7 @@ async def _down_select_docs_correct_content(docs, location, **kwargs):

async def _contains_ords(doc, **kwargs):
    """Helper coroutine that checks for ordinance info. """
-    doc = check_for_ordinance_info(doc, **kwargs)
+    doc = await check_for_ordinance_info(doc, **kwargs)
    return doc.metadata.get("contains_ord_info", False)

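The second hunk fixes a missing await: check_for_ordinance_info is a coroutine function, so calling it without await returns a coroutine object rather than the processed document, and the contains_ord_info metadata lookup never sees the result. A minimal sketch of the difference, using stand-in names rather than the actual elm.ords implementation:

    import asyncio

    async def check_for_ordinance_info(doc):
        # Stand-in for the real coroutine; pretend the LLM check ran
        doc["contains_ord_info"] = True
        return doc

    async def contains_ords_broken(doc):
        result = check_for_ordinance_info(doc)   # coroutine object; check never runs
        return isinstance(result, dict)          # False -- not the document at all

    async def contains_ords_fixed(doc):
        result = await check_for_ordinance_info(doc)   # runs the check, returns the doc
        return result.get("contains_ord_info", False)

    print(asyncio.run(contains_ords_broken({})))  # False (plus a "never awaited" warning)
    print(asyncio.run(contains_ords_fixed({})))   # True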
5 changes: 3 additions & 2 deletions elm/ords/extraction/ngrams.py
@@ -11,8 +11,9 @@
 from nltk.util import ngrams


-nltk.download("punkt")
-nltk.download("stopwords")
+nltk.download("punkt", quiet=True)
+nltk.download("punkt_tab", quiet=True)
+nltk.download("stopwords", quiet=True)
 STOP_WORDS = set(stopwords.words("english"))
 PUNCTUATIONS = {'"', ".", "(", ")", ",", "?", ";", ":", "''", "``"}

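The quiet=True flag suppresses NLTK's download progress output at import time, and newer NLTK releases ship the sentence-tokenizer tables as a separate punkt_tab resource alongside punkt. A hedged alternative, if re-running the download on every import is a concern, is to check for the data first; nltk.data.find raises LookupError when a resource is missing:

    import nltk

    # Resource names as used in this hunk; paths are the standard NLTK data locations.
    for resource, path in (("punkt", "tokenizers/punkt"),
                           ("punkt_tab", "tokenizers/punkt_tab"),
                           ("stopwords", "corpora/stopwords")):
        try:
            nltk.data.find(path)
        except LookupError:
            nltk.download(resource, quiet=True)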
4 changes: 2 additions & 2 deletions examples/ordinance_gpt/parse_pdf.py
@@ -59,8 +59,8 @@

 # 2) Build coroutine first, then use it to call async func
 # (extract_ordinance_text_with_llm is an async function)
-extrct = extract_ordinance_text_with_llm(doc, text_splitter, extractor)
-doc = ARun.run(services, extrct)
+extract = extract_ordinance_text_with_llm(doc, text_splitter, extractor)
+doc = ARun.run(services, extract)

 # 3) Build coroutine and use it to call async func in one go
 doc = ARun.run(services, extract_ordinance_values(doc, **kwargs))
2 changes: 1 addition & 1 deletion tests/ords/services/test_services_openai.py
@@ -67,7 +67,7 @@ async def _test_response(*args, **kwargs):
        )
        return sample_openai_response(kwargs=kwargs)

-    client = openai.AsyncOpenAI()
+    client = openai.AsyncOpenAI(api_key="dummy")
    monkeypatch.setattr(
        client.chat.completions,
        "create",
2 changes: 1 addition & 1 deletion tests/ords/test_integrated.py
@@ -77,7 +77,7 @@ async def _test_response(*args, **kwargs):
        )
        return sample_openai_response()

-    client = openai.AsyncOpenAI()
+    client = openai.AsyncOpenAI(api_key="dummy")
    monkeypatch.setattr(
        client.chat.completions,
        "create",
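Both test changes pass a placeholder API key because recent openai-python clients raise an error at construction time when no key is supplied (via argument or the OPENAI_API_KEY environment variable), even though the network call is monkeypatched away. A hedged sketch of the pattern, assuming pytest-asyncio is available and using a stand-in response instead of the repo's sample_openai_response fixture:

    import openai
    import pytest

    @pytest.mark.asyncio  # assumes pytest-asyncio is installed
    async def test_create_is_stubbed(monkeypatch):
        # Placeholder key: construction fails without one if OPENAI_API_KEY is unset
        client = openai.AsyncOpenAI(api_key="dummy")

        async def _fake_create(*args, **kwargs):
            # Stand-in for the repo's sample_openai_response helper
            return {"choices": [{"message": {"content": "stubbed"}}]}

        monkeypatch.setattr(client.chat.completions, "create", _fake_create)

        response = await client.chat.completions.create(model="gpt-4", messages=[])
        assert response["choices"][0]["message"]["content"] == "stubbed"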
