tokens.py
"""
A helper library for calculating the number of tokens in a text (string),
for different LLM providers and tokenization methods.
Also, this library provides helper functions for tokenizing a text (string)
"""
import os
import sys
import re
import tiktoken
import voyageai
import pandas as pd
from pprint import pprint

# Tokenization methods
def calc_tokens(text: str, method: str = 'tiktoken', model: str = 'gpt-3.5-turbo') -> int:
    """
    Calculate the number of tokens in a text (string).
    :param text: a string
    :param method: a string, the tokenization method
        ('tiktoken', 'anthropic', 'word', or 'char')
    :param model: a string, the model whose tokenizer to use ('tiktoken' only)
    :return: an integer, the number of tokens
    """
    if not text:
        return 0
    if method == 'tiktoken':
        encoding = tiktoken.encoding_for_model(model)
        return len(encoding.encode(text))
    elif method == 'anthropic':
        # Uses Voyage AI's tokenizer as a stand-in for Anthropic token
        # counting; requires the VOYAGE_API_KEY environment variable.
        vo = voyageai.Client()
        return vo.count_tokens([text])
    elif method == 'word':
        return len(re.findall(r'\b\w+\b', text))
    elif method == 'char':
        return len(text)
    else:
        raise ValueError(f'Unknown tokenization method: {method}')
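
# Quick sanity checks (illustrative): the 'word' and 'char' methods need no
# external services, while 'tiktoken' needs the tiktoken package installed
# and 'anthropic' needs a valid Voyage AI API key.
#   >>> calc_tokens("Hello, world!", method='word')
#   2
#   >>> calc_tokens("Hello, world!", method='char')
#   13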


def fetch_texts(directory: str) -> list[str]:
    """
    Fetch texts from a directory.
    :param directory: a string, the directory path
    :return: a list of strings, the contents of each .txt file
    """
    texts = []
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            print(f"Reading file: {filename}")
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
                texts.append(f.read())
    return texts
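
# Note: os.listdir is not recursive, so only .txt files directly inside the
# given directory are read; files in subdirectories are ignored.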


def calculate_tokens(directory: str, method: str = 'tiktoken', model: str = 'gpt-3.5-turbo') -> dict[str, int]:
    """
    Calculate the number of tokens in each text file under a directory.
    :param directory: a string, the directory path
    :param method: a string, the tokenization method
    :param model: a string, the model whose tokenizer to use ('tiktoken' only)
    :return: a dictionary mapping each text's label to its token count
    """
    texts = fetch_texts(directory)
    tokens = {}
    for i, text in enumerate(texts):
        tokens[f'text_{i}'] = calc_tokens(text, method, model)
    return tokens
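
# Illustrative usage (the "./docs" path and the counts shown are hypothetical):
#   >>> calculate_tokens("./docs", method='word')
#   {'text_0': 1234, 'text_1': 567}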


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit(f"Usage: python {sys.argv[0]} <directory>")
    token_stats = calculate_tokens(sys.argv[1], method='tiktoken', model='gpt-3.5-turbo')
    pprint(token_stats)
    # Summarize the distribution of token counts (count, mean, std, min,
    # quartiles, max) across all texts.
    df = pd.DataFrame(list(token_stats.items()), columns=['Text', 'Tokens'])
    print(df.describe())