-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathposting_list.cpp
182 lines (152 loc) · 4.46 KB
/
posting_list.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#include "posting_list.h"
posting_list::posting_list()
{
first = NULL;
word = NULL;
pl_size = 0;
}
void posting_list::print(bool full_print)
{
if (full_print == 0) //only print df
{
cout<<word<<" "<<pl_size<<endl;
return;
}
node * temp = first;
if (first == NULL)
{
cout<<"Posting list is empty"<<endl;
return;
}
cout<<"-> word:'"<<word<<"' "<<pl_size<<" ";
while(temp != NULL) //print whole posting list
{
cout<<"["<<temp->doc_id<<", "<<temp->freq<<"]"<<" ";
temp = temp->next;
}
cout<<endl;
}
unsigned int posting_list::get_size()
{
return pl_size; //df
}
posting_list::~posting_list()
{
node * temp = first;
node * temp2;
while(temp != NULL) //iterate list
{
temp2 = temp->next;
delete temp;
temp = temp2;
pl_size--;
}
free(word);
}
posting_list::node * posting_list::create_node(unsigned int doc_id)
{
node * new_node = new node;
new_node->doc_id = doc_id;
new_node->freq = 1;
new_node->next = NULL ;
return new_node;
}
void posting_list::update(char * word, unsigned int doc_id)
{ //create new list node if doc id is not contained in the list or increase its term frequency otherwise
// posting list is sorted
node * temp = first;
node * prev_temp;
node * n;
if(first == NULL)
{ // list is empty
n = create_node(doc_id);
first = n;
this->word = (char *) malloc((strlen(word) + 1) * sizeof(char));
strcpy(this->word, word);
pl_size++;
}
else
{ // list already exists
while(1)
{
if(temp == NULL)
{// we have iterated the whole list and doc id does not exist. so it will be added at the end of the list
n = create_node(doc_id);
pl_size++;
prev_temp->next = n;
break;
}
if (temp->doc_id == doc_id)
{ // doc id exists
temp->freq++; //increase its term frequency
break;
}
else if(temp->doc_id < doc_id)
{ //continue searching
prev_temp = temp;
temp = temp->next;
}
else
{//because the list is sorted and the current doc id is bigger than the inserting one
//we do not have to keep looking the rest of the list (the rest stored doc ids will be different than the inserting one)
// doc id will be added here
n = create_node(doc_id);
pl_size++;
n->next = temp;
if(prev_temp == NULL)
{ //we be added at the start of the list
first = n;
}
else
{
prev_temp->next = n;
}
break;
}
}
}
}
unsigned int posting_list::get_term_freq(unsigned int doc_id)
{
node * temp = first;
while(temp != NULL) //iterating posting list
{
if(temp->doc_id == doc_id)
{
return temp->freq; //found it
}
else if(temp->doc_id > doc_id)
{ // there is no point keep searching because list is sorted
return 0; //doc id not found
}
else
{
temp = temp->next;
}
}
return 0; // we have searched the whole list and doc id not found
}
unsigned int * posting_list::add_doc_id_to_array(unsigned int * unq_doc_ids, unsigned int & unq_doc_ids_size)
{// adding posting list's doc ids to an array. if a posting list's doc id is already contained in the array do not add it again
node * temp = first;
while(temp != NULL) //iterating posting list
{
bool doc_id_exists = 0;
for(unsigned int i = 0; i < unq_doc_ids_size; i++)
{ // checking if current doc id is already contained in the array
if (unq_doc_ids[i] == temp->doc_id)
{
doc_id_exists = 1; // already exists
break;
}
}
if(doc_id_exists == 0)
{ //if current doc id is not already contained in the array add it now
unq_doc_ids_size++;
unq_doc_ids = (unsigned int *) realloc(unq_doc_ids, unq_doc_ids_size * sizeof(unsigned int));
unq_doc_ids[unq_doc_ids_size - 1] = temp->doc_id;
}
temp = temp->next;
}
return unq_doc_ids;
}