-
Notifications
You must be signed in to change notification settings - Fork 10.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
266 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
114 changes: 114 additions & 0 deletions
114
src/main/java/com/hankcs/hanlp/corpus/io/EasyReader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
/* | ||
* <summary></summary> | ||
* <author>He Han</author> | ||
* <email>me@hankcs.com</email> | ||
* <create-date>2015/7/29 16:35</create-date> | ||
* | ||
* <copyright file="DumpReader.java" company="码农场"> | ||
* Copyright (c) 2008-2015, 码农场. All Right Reserved, http://www.hankcs.com/ | ||
* This source is subject to Hankcs. Please contact Hankcs to get more information. | ||
* </copyright> | ||
*/ | ||
package com.hankcs.hanlp.corpus.io; | ||
|
||
import java.io.File; | ||
import java.io.FileFilter; | ||
|
||
/** | ||
* 文本读取工具 | ||
* @author hankcs | ||
*/ | ||
public class EasyReader | ||
{ | ||
/** | ||
* 根目录 | ||
*/ | ||
String root; | ||
/** | ||
* 是否输出进度 | ||
*/ | ||
boolean verbose = true; | ||
|
||
/** | ||
* 构造 | ||
* @param root 根目录 | ||
*/ | ||
public EasyReader(String root) | ||
{ | ||
this.root = root; | ||
} | ||
|
||
/** | ||
* 构造 | ||
* @param root 根目录 | ||
* @param verbose 是否输出进度 | ||
*/ | ||
public EasyReader(String root, boolean verbose) | ||
{ | ||
this.root = root; | ||
this.verbose = verbose; | ||
} | ||
|
||
/** | ||
* 读取 | ||
* @param handler 处理逻辑 | ||
* @param size 读取多少个文件 | ||
* @throws Exception | ||
*/ | ||
public void read(LineHandler handler, int size) throws Exception | ||
{ | ||
File rootFile = new File(root); | ||
File[] files; | ||
if (rootFile.isDirectory()) | ||
{ | ||
files = rootFile.listFiles(new FileFilter() | ||
{ | ||
@Override | ||
public boolean accept(File pathname) | ||
{ | ||
return pathname.isFile() && !pathname.getName().endsWith(".bin"); | ||
} | ||
}); | ||
if (files == null) | ||
{ | ||
if (rootFile.isFile()) | ||
files = new File[]{rootFile}; | ||
else return; | ||
} | ||
} | ||
else | ||
{ | ||
files = new File[]{rootFile}; | ||
} | ||
|
||
int n = 0; | ||
int totalAddress = 0; | ||
long start = System.currentTimeMillis(); | ||
for (File file : files) | ||
{ | ||
if (size-- == 0) break; | ||
if (file.isDirectory()) continue; | ||
if (verbose) System.out.printf("正在处理%s, %d / %d\n", file.getName(), ++n, files.length); | ||
IOUtil.LineIterator lineIterator = new IOUtil.LineIterator(file.getAbsolutePath()); | ||
while (lineIterator.hasNext()) | ||
{ | ||
++totalAddress; | ||
String line = lineIterator.next(); | ||
if (line.length() == 0) continue; | ||
handler.handle(line); | ||
} | ||
} | ||
handler.done(); | ||
if (verbose) System.out.printf("处理了 %.2f 万行,花费了 %.2f min\n", totalAddress / 10000.0, (System.currentTimeMillis() - start) / 1000.0 / 60.0); | ||
} | ||
|
||
/** | ||
* 读取 | ||
* @param handler 处理逻辑 | ||
* @throws Exception | ||
*/ | ||
public void read(LineHandler handler) throws Exception | ||
{ | ||
read(handler, Integer.MAX_VALUE); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
/* | ||
* <summary></summary> | ||
* <author>He Han</author> | ||
* <email>me@hankcs.com</email> | ||
* <create-date>2015/7/29 16:37</create-date> | ||
* | ||
* <copyright file="DumpHander.java" company="码农场"> | ||
* Copyright (c) 2008-2015, 码农场. All Right Reserved, http://www.hankcs.com/ | ||
* This source is subject to Hankcs. Please contact Hankcs to get more information. | ||
* </copyright> | ||
*/ | ||
package com.hankcs.hanlp.corpus.io; | ||
|
||
import java.io.IOException; | ||
import java.util.LinkedList; | ||
import java.util.List; | ||
|
||
/** | ||
* @author hankcs | ||
*/ | ||
public abstract class LineHandler | ||
{ | ||
String delimiter = "\t"; | ||
|
||
public LineHandler(String delimiter) | ||
{ | ||
this.delimiter = delimiter; | ||
} | ||
|
||
public LineHandler() | ||
{ | ||
} | ||
|
||
public void handle(String line) throws Exception | ||
{ | ||
List<String> tokenList = new LinkedList<String>(); | ||
int start = 0; | ||
int end; | ||
while ((end = line.indexOf(delimiter, start)) != -1) | ||
{ | ||
tokenList.add(line.substring(start, end)); | ||
start = end + 1; | ||
} | ||
tokenList.add(line.substring(start, line.length())); | ||
handle(tokenList.toArray(new String[0])); | ||
} | ||
|
||
public void done() throws IOException | ||
{ | ||
// do noting | ||
} | ||
|
||
public abstract void handle(String[] params) throws IOException; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters