import numpy as np
np.random.seed(42)
"""
from Karpathy:
https://twitter.com/karpathy/status/1647025230546886658
Q: Can this approach for finding "similar" embeddings also be transformed
to make a better classifier / regressor in high dimensional spaces?
"""
dim = 768
n = 1000
embeddings = np.random.randn(n, dim) # n documents, dim-dimensional embeddings
embeddings = embeddings / np.sqrt((embeddings**2).sum(1, keepdims=True)) # L2 normalize the rows, as is common
query = np.random.randn(dim) # the query vector
query = query / np.sqrt((query**2).sum())
# Tired: use kNN
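# (with unit-norm rows, the dot product below is exactly the cosine similarity)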
similarities = embeddings.dot(query)
sorted_ix = np.argsort(-similarities)
print("top 10 results:")
for k in sorted_ix[:10]:
    print(f"row {k}, similarity {similarities[k]}")
# Wired: use an SVM
from sklearn import svm
# create the "Dataset"
x = np.concatenate([query[None,...], embeddings]) # x is (n+1, dim) = (1001, 768), with the query as the first row
y = np.zeros(n+1)
y[0] = 1 # we have a single positive example, mark it as such
# train SVM
# docs: https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.1)
clf.fit(x, y) # train
# infer on whatever data you wish, e.g. the original data
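# decision_function returns the signed score w·x + b relative to the learned
# hyperplane; larger means "more on the query side", so sort descending as before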
similarities = clf.decision_function(x)
sorted_ix = np.argsort(-similarities)
print("\nSVM:")
print("top 10 results:")
for k in sorted_ix[:10]:
    print(f"row {k}, similarity {similarities[k]}")