Skip to content

Commit

Permalink
积累了一些小优化,小版本+1
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Sep 9, 2015
1 parent 4990d1c commit 27158e3
Show file tree
Hide file tree
Showing 5 changed files with 266 additions and 8 deletions.
11 changes: 10 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>1.2.4</version>
<version>1.2.5</version>

<name>HanLP</name>
<url>http://www.hankcs.com/</url>
Expand Down Expand Up @@ -79,6 +79,15 @@
</execution>
</executions>
</plugin>
<!-- failsafe插件,跳过测试-->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-failsafe-plugin</artifactId>
<version>2.18.1</version>
<configuration>
<skipTests>true</skipTests>
</configuration>
</plugin>
</plugins>
</build>
<properties>
Expand Down
114 changes: 114 additions & 0 deletions src/main/java/com/hankcs/hanlp/corpus/io/EasyReader.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>me@hankcs.com</email>
* <create-date>2015/7/29 16:35</create-date>
*
* <copyright file="EasyReader.java" company="码农场">
* Copyright (c) 2008-2015, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.corpus.io;

import java.io.File;
import java.io.FileFilter;

/**
 * Text reading helper: feeds the lines of a single file, or of every regular file directly
 * inside a directory, to a {@link LineHandler}.
 *
 * @author hankcs
 */
public class EasyReader
{
    /**
     * Path of the file or directory to read.
     */
    String root;
    /**
     * Whether progress information is printed to standard output.
     */
    boolean verbose = true;

    /**
     * Creates a reader that prints progress.
     *
     * @param root path of the file or directory to read
     */
    public EasyReader(String root)
    {
        this.root = root;
    }

    /**
     * Creates a reader.
     *
     * @param root    path of the file or directory to read
     * @param verbose whether progress information is printed
     */
    public EasyReader(String root, boolean verbose)
    {
        this.root = root;
        this.verbose = verbose;
    }

    /**
     * Reads up to {@code size} files and passes every non-empty line to {@code handler}.
     * When {@code root} is a directory, only its direct regular files whose names do not end
     * with ".bin" are read; otherwise {@code root} itself is read. After the last file
     * {@link LineHandler#done()} is invoked once.
     *
     * @param handler receives each non-empty line and the final {@code done()} call
     * @param size    how many files to read at most; 0 reads no file
     * @throws Exception whatever the handler throws (the declaration also covers anything
     *                   raised while iterating the file)
     */
    public void read(LineHandler handler, int size) throws Exception
    {
        File rootFile = new File(root);
        File[] files;
        if (rootFile.isDirectory())
        {
            files = rootFile.listFiles(new FileFilter()
            {
                @Override
                public boolean accept(File pathname)
                {
                    return pathname.isFile() && !pathname.getName().endsWith(".bin");
                }
            });
            // listFiles yields null when the directory cannot be listed. The former
            // isFile() fallback here was unreachable, since rootFile is known to be a directory.
            if (files == null) return;
        }
        else
        {
            files = new File[]{rootFile};
        }

        int n = 0;
        int totalLines = 0;
        long start = System.currentTimeMillis();
        for (File file : files)
        {
            if (size-- == 0) break;
            if (file.isDirectory()) continue;
            if (verbose) System.out.printf("正在处理%s, %d / %d\n", file.getName(), ++n, files.length);
            IOUtil.LineIterator lineIterator = new IOUtil.LineIterator(file.getAbsolutePath());
            while (lineIterator.hasNext())
            {
                // Empty lines are counted in the statistics but never reach the handler.
                ++totalLines;
                String line = lineIterator.next();
                if (line.length() == 0) continue;
                handler.handle(line);
            }
        }
        handler.done();
        // The summary reports lines in units of ten thousand and elapsed time in minutes.
        if (verbose) System.out.printf("处理了 %.2f 万行,花费了 %.2f min\n", totalLines / 10000.0, (System.currentTimeMillis() - start) / 1000.0 / 60.0);
    }

    /**
     * Reads without a file limit; equivalent to {@code read(handler, Integer.MAX_VALUE)}.
     *
     * @param handler receives each non-empty line and the final {@code done()} call
     * @throws Exception see {@link #read(LineHandler, int)}
     */
    public void read(LineHandler handler) throws Exception
    {
        read(handler, Integer.MAX_VALUE);
    }
}
57 changes: 57 additions & 0 deletions src/main/java/com/hankcs/hanlp/corpus/io/IOUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -361,4 +361,61 @@ public void remove()
throw new UnsupportedOperationException("只读,不可写!");
}
}

/**
 * Opens the file at the given path for writing and wraps it in a UTF-8 encoding
 * {@link BufferedWriter}.
 *
 * @param path path of the file to write
 * @return a buffered writer that encodes characters as UTF-8
 * @throws FileNotFoundException        if the file cannot be opened for writing
 * @throws UnsupportedEncodingException if the "UTF-8" charset name is not supported
 */
public static BufferedWriter newBufferedWriter(String path) throws FileNotFoundException, UnsupportedEncodingException
{
    FileOutputStream fileStream = new FileOutputStream(path);
    OutputStreamWriter utf8Writer = new OutputStreamWriter(fileStream, "UTF-8");
    return new BufferedWriter(utf8Writer);
}

/**
 * Opens the file at the given path for reading and wraps it in a UTF-8 decoding
 * {@link BufferedReader}.
 *
 * @param path path of the file to read
 * @return a buffered reader that decodes the file content as UTF-8
 * @throws FileNotFoundException        if the file cannot be opened for reading
 * @throws UnsupportedEncodingException if the "UTF-8" charset name is not supported
 */
public static BufferedReader newBufferedReader(String path) throws FileNotFoundException, UnsupportedEncodingException
{
    FileInputStream fileStream = new FileInputStream(path);
    InputStreamReader utf8Reader = new InputStreamReader(fileStream, "UTF-8");
    return new BufferedReader(utf8Reader);
}

/**
 * Opens the file at the given path for writing, optionally appending to existing content,
 * and wraps it in a UTF-8 encoding {@link BufferedWriter}.
 *
 * @param path   path of the file to write
 * @param append passed to {@link FileOutputStream#FileOutputStream(String, boolean)};
 *               {@code true} writes at the end of the file instead of the beginning
 * @return a buffered writer that encodes characters as UTF-8
 * @throws FileNotFoundException        if the file cannot be opened for writing
 * @throws UnsupportedEncodingException if the "UTF-8" charset name is not supported
 */
public static BufferedWriter newBufferedWriter(String path, boolean append) throws FileNotFoundException, UnsupportedEncodingException
{
    FileOutputStream fileStream = new FileOutputStream(path, append);
    OutputStreamWriter utf8Writer = new OutputStreamWriter(fileStream, "UTF-8");
    return new BufferedWriter(utf8Writer);
}

/**
 * Returns the part of {@code name} that follows the last occurrence of {@code delimiter}.
 *
 * @param name      the text to inspect, e.g. a file name
 * @param delimiter the separator to search for; may be longer than one character
 * @return the text after the last delimiter; {@code name} itself when the delimiter does
 *         not occur; the empty string when the delimiter is empty or ends {@code name}
 */
public static String getSuffix(String name, String delimiter)
{
    int index = name.lastIndexOf(delimiter);
    if (index < 0)
    {
        return name;
    }
    // Skip the whole delimiter, not just one character, so that multi-character
    // delimiters do not leak into the result and an empty delimiter cannot overrun.
    return name.substring(index + delimiter.length());
}

/**
 * Writes the given values to {@code bw}, separated by tab characters.
 * No line terminator is written; with no values nothing is written at all.
 *
 * @param bw     the writer that receives the output
 * @param params the values to write, in order
 * @throws IOException if writing fails
 */
public static void writeLine(BufferedWriter bw, String... params) throws IOException
{
    // Guard the empty call: indexing the "last" element of an empty array used to throw.
    if (params.length == 0)
    {
        return;
    }
    bw.write(params[0]);
    for (int i = 1; i < params.length; i++)
    {
        bw.write('\t');
        bw.write(params[i]);
    }
}
}
54 changes: 54 additions & 0 deletions src/main/java/com/hankcs/hanlp/corpus/io/LineHandler.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>me@hankcs.com</email>
* <create-date>2015/7/29 16:37</create-date>
*
* <copyright file="LineHandler.java" company="码农场">
* Copyright (c) 2008-2015, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.corpus.io;

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;

/**
 * Base class for line-oriented processing: splits each line into fields on a delimiter
 * and hands the fields to {@link #handle(String[])}.
 *
 * @author hankcs
 */
public abstract class LineHandler
{
    /**
     * Separator between the fields of a line; a tab unless set through the constructor.
     */
    String delimiter = "\t";

    /**
     * Creates a handler that splits lines on the given delimiter.
     *
     * @param delimiter separator between fields; may be longer than one character
     */
    public LineHandler(String delimiter)
    {
        this.delimiter = delimiter;
    }

    /**
     * Creates a handler that splits lines on the tab character.
     */
    public LineHandler()
    {
    }

    /**
     * Splits {@code line} on the delimiter and forwards the fields to
     * {@link #handle(String[])}. Empty fields, including a trailing one, are kept.
     *
     * @param line the line to split
     * @throws Exception whatever {@link #handle(String[])} throws
     */
    public void handle(String line) throws Exception
    {
        List<String> tokenList = new LinkedList<String>();
        int start = 0;
        // An empty delimiter matches at every index and would never let the scan
        // terminate properly, so in that case the line is passed on as a single field.
        if (delimiter.length() > 0)
        {
            int end;
            while ((end = line.indexOf(delimiter, start)) != -1)
            {
                tokenList.add(line.substring(start, end));
                // Skip the whole delimiter so multi-character separators split correctly.
                start = end + delimiter.length();
            }
        }
        tokenList.add(line.substring(start));
        handle(tokenList.toArray(new String[0]));
    }

    /**
     * Hook invoked by the caller once all lines have been handled; does nothing by default.
     *
     * @throws IOException declared so that overriding implementations may throw it
     */
    public void done() throws IOException
    {
        // do nothing
    }

    /**
     * Processes the fields of one line.
     *
     * @param params the fields of the line, in order
     * @throws IOException if the implementation fails on I/O
     */
    public abstract void handle(String[] params) throws IOException;
}
38 changes: 31 additions & 7 deletions src/test/java/com/hankcs/test/model/TestCRF.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,15 @@

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.collection.trie.bintrie.BinTrie;
import com.hankcs.hanlp.corpus.dictionary.EasyDictionary;
import com.hankcs.hanlp.corpus.document.CorpusLoader;
import com.hankcs.hanlp.corpus.document.Document;
import com.hankcs.hanlp.corpus.document.sentence.word.IWord;
import com.hankcs.hanlp.corpus.document.sentence.word.Word;
import com.hankcs.hanlp.corpus.io.ByteArray;
import com.hankcs.hanlp.corpus.io.EasyReader;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.corpus.io.LineHandler;
import com.hankcs.hanlp.corpus.util.Precompiler;
import com.hankcs.hanlp.model.crf.FeatureFunction;
import com.hankcs.hanlp.model.crf.FeatureTemplate;
Expand Down Expand Up @@ -93,17 +97,19 @@ public void testSegment() throws Exception
*/
public void testPrepareCRFTrainingCorpus() throws Exception
{
final BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("D:\\Tools\\CRF++-0.58\\example\\seg_cn\\2014人民日报语料BMES切分.txt"), "UTF-8"));
CorpusLoader.walk("H:\\seg_corpus", new CorpusLoader.Handler()
final BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("e:\\2014.txt"), "UTF-8"));
CorpusLoader.walk("D:\\Doc\\语料库\\2014_hankcs", new CorpusLoader.Handler()
{
@Override
public void handle(Document document)
{
try
{
List<List<Word>> sentenceList = document.getSimpleSentenceList();
if (sentenceList.size() == 0) return;
for (List<Word> sentence : sentenceList)
{
if (sentence.size() == 0) continue;
for (IWord iWord : sentence)
{
String word = iWord.getValue();
Expand All @@ -118,28 +124,28 @@ public void handle(Document document)
bw.write(word);
bw.write('\t');
bw.write('S');
bw.newLine();
bw.write('\n');
}
else
{
bw.write(word.charAt(0));
bw.write('\t');
bw.write('B');
bw.newLine();
bw.write('\n');
for (int i = 1; i < word.length() - 1; ++i)
{
bw.write(word.charAt(i));
bw.write('\t');
bw.write('M');
bw.newLine();
bw.write('\n');
}
bw.write(word.charAt(word.length() - 1));
bw.write('\t');
bw.write('E');
bw.newLine();
bw.write('\n');
}
}
bw.newLine();
bw.write('\n');
}
}
catch (IOException e)
Expand Down Expand Up @@ -187,4 +193,22 @@ public void testLoadModelWithBiGramFeature() throws Exception
model.tag(table);
System.out.println(table);
}

/**
 * Copies the input file to the output file line by line, dropping every empty line that
 * is at the start of the file or directly follows another empty line.
 *
 * @throws Exception if either file cannot be opened, read or written
 */
public void testRemoveSpace() throws Exception
{
    String inputPath = "E:\\2014.txt";
    String outputPath = "E:\\2014f.txt";
    BufferedReader br = IOUtil.newBufferedReader(inputPath);
    try
    {
        BufferedWriter bw = IOUtil.newBufferedWriter(outputPath);
        try
        {
            String line;
            // Length of the previous input line; 0 initially so leading blank lines are dropped.
            int preLength = 0;
            while ((line = br.readLine()) != null)
            {
                if (preLength == 0 && line.length() == 0) continue;
                bw.write(line);
                bw.newLine();
                preLength = line.length();
            }
        }
        finally
        {
            // Close (and thereby flush) the writer even when copying fails.
            bw.close();
        }
    }
    finally
    {
        // The reader was previously never closed.
        br.close();
    }
}
}

0 comments on commit 27158e3

Please # to comment.