forked from Benmckenzie96/csce-482-ml
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathorg_dataset.py
128 lines (109 loc) · 4.29 KB
/
org_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import pandas as pd
from org import Org
import pickle
import numpy as np
class OrgDataset:
    """Local database of organizations backed by a pandas dataframe.

    Attributes:
        attributes (list): a list of strings. Each entry
            is a column label for the dataframe.
        dataframe (DataFrame): the dataframe containing
            all of the organization data. You shouldn't have
            to directly interact with this. Accessing the dataframe
            should be done using the functions of this class.
    """

    def __init__(self):
        """Initializes an OrgDataset object with an
        empty dataframe to store data. The dataframe
        has the same columns as the values returned by
        'attribute_labels' function found in 'Org' class.
        """
        self.attributes = Org.attribute_labels()
        self.dataframe = pd.DataFrame(columns=self.attributes)

    def add_orgs(self, orgs):
        """Adds a list of 'Org' objects to the database.

        Each org contributes one row, taken from its 'data()'
        method. Rows are appended after the existing ones and the
        dataframe index is renumbered 0..n-1, so index labels stay
        unique across repeated calls.

        Args:
            orgs (list): a list of Org objects. An empty list
                leaves the database unchanged.
        """
        data = [org.data() for org in orgs]
        if not data:
            return
        new_orgs = pd.DataFrame(data=data, columns=self.attributes)
        if self.dataframe.empty:
            # Concatenating onto an empty frame is deprecated in pandas
            # and contributes nothing, so adopt the new rows directly.
            self.dataframe = new_orgs
        else:
            # ignore_index avoids duplicate labels (0, 1, 0, 1, ...)
            # that plain concatenation would produce.
            self.dataframe = pd.concat(
                [self.dataframe, new_orgs], ignore_index=True
            )

    def get_orgs_by_indices(self, indices):
        """Returns the rows of the dataframe at the
        supplied positions.

        Args:
            indices (list): A python list of integers. These are
                positional (0-based) row numbers, as used by
                'DataFrame.iloc'.

        Returns:
            A subset of the dataframe containing the selected
            organizations. All fields of the dataframe are
            returned.
        """
        return self.dataframe.iloc[indices]

    def get_org_descriptions(self, indices=None):
        """Returns organization descriptions.

        Args:
            indices (list): an optional list of integers
                representing which rows of the orgs dataframe
                should have descriptions returned. If no value is
                provided, all org descriptions are returned.

        Returns:
            a numpy array holding the values of the 'orgPurpose'
            column for the selected rows.
        """
        if indices is not None:
            df = self.get_orgs_by_indices(indices)
        else:
            df = self.dataframe
        return df['orgPurpose'].to_numpy()

    def get_random_org_ids(self, num):
        """Fetches random org ids from the dataset.

        Rows are drawn with replacement, so the same id may
        appear more than once in the result.

        Args:
            num (int): The number of random org ids to
                fetch.

        Returns:
            a numpy array containing the values of the 'orgId'
            column for the randomly chosen rows. If the dataset
            has no rows, an empty array is returned.
        """
        row_count = self.dataframe.shape[0]
        if row_count == 0:
            # np.random.randint(0, 0, ...) would raise ValueError;
            # there is nothing to sample from, so return no ids.
            return self.dataframe['orgId'].to_numpy()
        indices = np.random.randint(0, row_count, num)
        return self.get_orgs_by_indices(indices)['orgId'].to_numpy()

    def get_orgs_by_id(self, ids, only_desc=False):
        """Gets the rows of the database dataframe with
        orgId equal to the supplied values.

        Rows come back in dataframe order, not in the order
        of 'ids'.

        Args:
            ids (list): a list (or np array) of strings
                where each entry is an organization id.
            only_desc (bool, optional): defaults to False.
                if set to True, rather than dataframe rows
                being returned, only an array containing each
                organization's description will be returned.

        Returns:
            One of the following depending on the value for
            only_desc:
                False: a dataframe containing the orgs with ids
                    equal to ids.
                True: an array containing only the matching rows'
                    'orgPurpose' values.
        """
        df = self.dataframe.loc[self.dataframe['orgId'].isin(ids)]
        if only_desc:
            return df['orgPurpose'].to_numpy()
        return df

    def save_instance(self, destination):
        """Saves OrgDataset instance to a pickle file
        in the specified destination.

        Args:
            destination (str): The destination to save
                this OrgDataset instance to.
        """
        with open(destination, 'wb') as f:
            pickle.dump(self, f)
def load_instance(location):
    """Loads a pickled object from the given file.

    Args:
        location (str): path of the pickle file to read;
            presumably one written by 'save_instance'.

    Returns:
        The object stored in the pickle file.

    Note:
        Unpickling can execute arbitrary code, so only
        load files that come from a trusted source.
    """
    with open(location, 'rb') as pickle_file:
        restored = pickle.load(pickle_file)
    return restored