Merge pull request #39 from NREL/pp/dev
Misc updates and bug fixes
ppinchuk authored Nov 21, 2024
2 parents 2e4f268 + 09ca946 commit 1127b0e
Showing 7 changed files with 38 additions and 11 deletions.
7 changes: 7 additions & 0 deletions .gitignore
@@ -124,3 +124,10 @@ examples/research_hub/txt/
 examples/research_hub/meta.csv
 *ignore*.py

+# pixi environments
+.pixi
+*.egg-info
+pixi*
+
+# Scratch
+*scratch*/
26 changes: 23 additions & 3 deletions elm/base.py
@@ -46,10 +46,18 @@ class ApiBase(ABC):

     TOKENIZER_ALIASES = {'gpt-35-turbo': 'gpt-3.5-turbo',
                          'gpt-4-32k': 'gpt-4-32k-0314',
-                         'llmev-gpt-4-32k': 'gpt-4-32k-0314'
+                         'llmev-gpt-4-32k': 'gpt-4-32k-0314',
+                         'wetosa-gpt-4': 'gpt-4',
+                         'wetosa-gpt-4-standard': 'gpt-4',
+                         'wetosa-gpt-4o': 'gpt-4o',
                          }
     """Optional mappings for unusual Azure names to tiktoken/openai names."""

+    TOKENIZER_PATTERNS = ('gpt-4o', 'gpt-4-32k', 'gpt-4')
+    """Order-prioritized list of model sub-strings to look for in model name
+    to send to tokenizer. As an alternative to alias lookup, this will use the
+    tokenizer pattern if found in the model string"""
+
     def __init__(self, model=None):
         """
         Parameters
@@ -345,7 +353,7 @@ def get_embedding(cls, text):
         return embedding

     @classmethod
-    def count_tokens(cls, text, model):
+    def count_tokens(cls, text, model, fallback_model='gpt-4'):
         """Return the number of tokens in a string.

         Parameters
@@ -354,14 +362,26 @@ def count_tokens(cls, text, model):
             Text string to get number of tokens for
         model : str
             specification of OpenAI model to use (e.g., "gpt-3.5-turbo")
+        fallback_model : str, default='gpt-4'
+            Model to be used for tokenizer if input model can't be found
+            in :obj:`TOKENIZER_ALIASES` and doesn't have any easily
+            noticeable patterns.
+
         Returns
         -------
         n : int
             Number of tokens in text
         """

-        token_model = cls.TOKENIZER_ALIASES.get(model, model)
+        if model in cls.TOKENIZER_ALIASES:
+            token_model = cls.TOKENIZER_ALIASES[model]
+        else:
+            token_model = fallback_model
+            for pattern in cls.TOKENIZER_PATTERNS:
+                if pattern in model:
+                    token_model = pattern
+                    break

         encoding = tiktoken.encoding_for_model(token_model)

         return len(encoding.encode(text))
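Note on the new lookup order in count_tokens: exact alias match first, then the first matching substring pattern, then the fallback model. Below is a minimal standalone sketch of that resolution logic, assuming only that tiktoken is installed; the attribute values are copied from this diff, and resolve_tokenizer is an illustrative helper, not a function in elm.base.

    import tiktoken

    # Attribute values mirror the diff above; this is a sketch, not library code.
    TOKENIZER_ALIASES = {'gpt-35-turbo': 'gpt-3.5-turbo',
                         'gpt-4-32k': 'gpt-4-32k-0314',
                         'llmev-gpt-4-32k': 'gpt-4-32k-0314',
                         'wetosa-gpt-4': 'gpt-4',
                         'wetosa-gpt-4-standard': 'gpt-4',
                         'wetosa-gpt-4o': 'gpt-4o'}
    TOKENIZER_PATTERNS = ('gpt-4o', 'gpt-4-32k', 'gpt-4')

    def resolve_tokenizer(model, fallback_model='gpt-4'):
        """Exact alias first, then ordered substring match, then the fallback."""
        if model in TOKENIZER_ALIASES:
            return TOKENIZER_ALIASES[model]
        for pattern in TOKENIZER_PATTERNS:
            if pattern in model:
                return pattern
        return fallback_model

    token_model = resolve_tokenizer("wetosa-gpt-4o")            # 'gpt-4o' via alias
    token_model = resolve_tokenizer("my-custom-gpt-4o-deploy")  # 'gpt-4o' via pattern
    encoding = tiktoken.encoding_for_model(token_model)
    print(len(encoding.encode("How many tokens is this?")))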
3 changes: 1 addition & 2 deletions elm/ords/download.py
@@ -103,7 +103,6 @@ async def _docs_from_google_search(
     return await google_results_as_docs(
         queries,
         num_urls=num_urls,
-        text_splitter=text_splitter,
         browser_semaphore=browser_semaphore,
         task_name=location.full_name,
         **file_loader_kwargs,
@@ -135,7 +134,7 @@ async def _down_select_docs_correct_content(docs, location, **kwargs):

async def _contains_ords(doc, **kwargs):
    """Helper coroutine that checks for ordinance info. """
-    doc = check_for_ordinance_info(doc, **kwargs)
+    doc = await check_for_ordinance_info(doc, **kwargs)
    return doc.metadata.get("contains_ord_info", False)

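The second hunk fixes a missing await: check_for_ordinance_info is a coroutine function, so calling it without await returns a coroutine object rather than the processed document, and the contains_ord_info metadata lookup never sees the result. A minimal sketch of the difference, using stand-in names rather than the actual elm.ords implementation:

    import asyncio

    async def check_for_ordinance_info(doc):
        # Stand-in for the real coroutine; pretend the LLM check ran
        doc["contains_ord_info"] = True
        return doc

    async def contains_ords_broken(doc):
        result = check_for_ordinance_info(doc)   # coroutine object; check never runs
        return isinstance(result, dict)          # False -- not the document at all

    async def contains_ords_fixed(doc):
        result = await check_for_ordinance_info(doc)   # runs the check, returns the doc
        return result.get("contains_ord_info", False)

    print(asyncio.run(contains_ords_broken({})))  # False (plus a "never awaited" warning)
    print(asyncio.run(contains_ords_fixed({})))   # True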
5 changes: 3 additions & 2 deletions elm/ords/extraction/ngrams.py
@@ -11,8 +11,9 @@
 from nltk.util import ngrams


-nltk.download("punkt")
-nltk.download("stopwords")
+nltk.download("punkt", quiet=True)
+nltk.download("punkt_tab", quiet=True)
+nltk.download("stopwords", quiet=True)
 STOP_WORDS = set(stopwords.words("english"))
 PUNCTUATIONS = {'"', ".", "(", ")", ",", "?", ";", ":", "''", "``"}

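The quiet=True flag suppresses NLTK's download progress output at import time, and newer NLTK releases ship the sentence-tokenizer tables as a separate punkt_tab resource alongside punkt. A hedged alternative, if re-running the download on every import is a concern, is to check for the data first; nltk.data.find raises LookupError when a resource is missing:

    import nltk

    # Resource names as used in this hunk; paths are the standard NLTK data locations.
    for resource, path in (("punkt", "tokenizers/punkt"),
                           ("punkt_tab", "tokenizers/punkt_tab"),
                           ("stopwords", "corpora/stopwords")):
        try:
            nltk.data.find(path)
        except LookupError:
            nltk.download(resource, quiet=True)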
4 changes: 2 additions & 2 deletions examples/ordinance_gpt/parse_pdf.py
@@ -59,8 +59,8 @@

 # 2) Build coroutine first, then use it to call async func
 # (extract_ordinance_text_with_llm is an async function)
-extrct = extract_ordinance_text_with_llm(doc, text_splitter, extractor)
-doc = ARun.run(services, extrct)
+extract = extract_ordinance_text_with_llm(doc, text_splitter, extractor)
+doc = ARun.run(services, extract)

 # 3) Build coroutine and use it to call async func in one go
 doc = ARun.run(services, extract_ordinance_values(doc, **kwargs))
2 changes: 1 addition & 1 deletion tests/ords/services/test_services_openai.py
@@ -67,7 +67,7 @@ async def _test_response(*args, **kwargs):
        )
        return sample_openai_response(kwargs=kwargs)

-    client = openai.AsyncOpenAI()
+    client = openai.AsyncOpenAI(api_key="dummy")
    monkeypatch.setattr(
        client.chat.completions,
        "create",
2 changes: 1 addition & 1 deletion tests/ords/test_integrated.py
@@ -77,7 +77,7 @@ async def _test_response(*args, **kwargs):
        )
        return sample_openai_response()

-    client = openai.AsyncOpenAI()
+    client = openai.AsyncOpenAI(api_key="dummy")
    monkeypatch.setattr(
        client.chat.completions,
        "create",
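Both test changes pass a placeholder API key because recent openai-python clients raise an error at construction time when no key is supplied (via argument or the OPENAI_API_KEY environment variable), even though the network call is monkeypatched away. A hedged sketch of the pattern, assuming pytest-asyncio is available and using a stand-in response instead of the repo's sample_openai_response fixture:

    import openai
    import pytest

    @pytest.mark.asyncio  # assumes pytest-asyncio is installed
    async def test_create_is_stubbed(monkeypatch):
        # Placeholder key: construction fails without one if OPENAI_API_KEY is unset
        client = openai.AsyncOpenAI(api_key="dummy")

        async def _fake_create(*args, **kwargs):
            # Stand-in for the repo's sample_openai_response helper
            return {"choices": [{"message": {"content": "stubbed"}}]}

        monkeypatch.setattr(client.chat.completions, "create", _fake_create)

        response = await client.chat.completions.create(model="gpt-4", messages=[])
        assert response["choices"][0]["message"]["content"] == "stubbed"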
