-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDocumentParser.java
125 lines (110 loc) · 3.81 KB
/
DocumentParser.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
package ir_pa.project;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Date;
import java.util.UUID;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.jsoup.Jsoup;
public class DocumentParser {
private Path indexPath;
private File initFile;
private IndexWriter indexWriter;
public static enum FileType {
HTML, TXT
}
public DocumentParser(File initFile, IndexWriter indexWriter) {
if (initFile == null) {
writeError(new NullPointerException("initFile is null"));
System.exit(-1);
}
if (indexWriter == null) {
writeError(new NullPointerException("indexWriter is null"));
System.exit(-1);
}
this.initFile = initFile;
this.indexWriter = indexWriter;
this.indexPath = initFile.toPath().getParent();
}
public void parseDocuments() {
try {
indexFiles(this.initFile);
} catch (IOException ex) {
writeError(ex.getClass().toString(), ex.getMessage());
} catch (Exception ex) {
writeError(ex.getClass().toString(), ex.getMessage());
}
}
public Path getIndexPath() {
return this.indexPath;
}
public void closeIndexWriter() throws IOException {
this.indexWriter.close();
}
private void indexFiles(File source) throws IOException {
if (source.isDirectory()) {
for (File file : source.listFiles()) {
indexFiles(file);
}
} else {
String filename = source.getName().toLowerCase();
if (filename.endsWith(FileType.HTML.toString().toLowerCase())) {
addFile(source, FileType.HTML);
} else if (filename.endsWith(FileType.TXT.toString().toLowerCase())) {
addFile(source, FileType.TXT);
} else {
System.out.println(
source.getAbsolutePath() + " is not supported for indexing. Error: Unspported file format");
}
}
}
private void addFile(File source, FileType fileType) {
FileReader fileReader = null;
String filepath = source.getAbsolutePath();
org.apache.lucene.document.Document indexDoc = new org.apache.lucene.document.Document();
String date = new Date(source.lastModified()).toString();
try {
indexDoc.add(new StringField("filepath", filepath, Field.Store.YES));
indexDoc.add(new StringField("index", UUID.randomUUID().toString(), Field.Store.YES));
indexDoc.add(new StringField("date", date, Field.Store.YES));
switch (fileType) {
case HTML:
org.jsoup.nodes.Document htmlDoc = Jsoup.parse(source, "UTF-8");
String title = htmlDoc.getElementsByTag("title").text();
String summary = !(htmlDoc.getElementsByTag("summary").isEmpty())
? htmlDoc.getElementsByTag("summary").text()
: htmlDoc.body().text().substring(0, 50);
String contents = htmlDoc.body().text();
indexDoc.add(new TextField("title", title, Field.Store.YES));
indexDoc.add(new TextField("summary", summary, Field.Store.YES));
indexDoc.add(new TextField("contents", contents, Field.Store.NO));
break;
case TXT:
try {
fileReader = new FileReader(source);
indexDoc.add(new TextField("contents", fileReader));
} catch (IOException ex) {
System.out.println("Error reading file: " + filepath + ", error: " + ex.getMessage());
throw (ex);
}
break;
}
this.indexWriter.addDocument(indexDoc);
System.out.println(filepath + " indexed successfully");
if (fileReader != null)
fileReader.close();
} catch (IOException ex) {
System.out.println(filepath + " could not be added. Error: " + ex.getMessage());
}
}
private void writeError(Exception ex) {
System.out.println("Error Type: " + ex.getClass() + "\nError Message: " + ex.getMessage());
}
private void writeError(String errorType, String errorMessage) {
System.out.println("Error Type: " + errorType + "\nError Message: " + errorMessage);
}
}