Skip to content

Commit

Permalink
(doc) Switch from mozilla chardet to ibm-icu for character detection
Browse files Browse the repository at this point in the history
  • Loading branch information
slachiewicz committed Dec 14, 2021
1 parent 9b6f849 commit e8cbaca
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 118 deletions.
9 changes: 5 additions & 4 deletions archetype-common/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,6 @@
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>net.sourceforge.jchardet</groupId>
<artifactId>jchardet</artifactId>
</dependency>
<dependency>
<groupId>org.codehaus.plexus</groupId>
<artifactId>plexus-component-annotations</artifactId>
Expand Down Expand Up @@ -178,6 +174,11 @@
<artifactId>xmlunit-matchers</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>70.1</version>
</dependency>
</dependencies>

<build>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,142 +19,49 @@
* under the License.
*/

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import org.codehaus.plexus.logging.AbstractLogEnabled;
import org.mozilla.intl.chardet.nsDetector;
import org.mozilla.intl.chardet.nsICharsetDetectionObserver;
import org.mozilla.intl.chardet.nsPSMDetector;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;

/**
* @author rafale
*/
public class FileCharsetDetector
extends AbstractLogEnabled
{
private String charset = null;

private boolean found = false;
private final String charset;

public FileCharsetDetector( File detectedFile )
throws FileNotFoundException, IOException
throws IOException
{
nsDetector det = new nsDetector( nsPSMDetector.ALL );

det.Init( new nsICharsetDetectionObserver()
{
@Override
@SuppressWarnings( "checkstyle:methodname" )
public void Notify( String charset )
{
FileCharsetDetector.this.charset = charset;
FileCharsetDetector.this.found = true;
}
} );

try ( FileInputStream fileInputStream = new FileInputStream( detectedFile );
BufferedInputStream imp = new BufferedInputStream( fileInputStream ) )
BufferedInputStream is = new BufferedInputStream( fileInputStream ) )
{
byte[] buf = new byte[1024];
int len;
boolean done = false;
boolean isAscii = true;

while ( ( len = imp.read( buf, 0, buf.length ) ) != -1 )
{
// Check if the stream is only ascii.
if ( isAscii )
{
isAscii = det.isAscii( buf, len );
}

// DoIt if non-ascii and not done yet.
if ( !isAscii && !done )
{
done = det.DoIt( buf, len, false );
found = done;
}
}
det.DataEnd();
CharsetDetector detector = new CharsetDetector();
detector.setText( is );
CharsetMatch match = detector.detect();

if ( !isFound() )
{
String[] prob = det.getProbableCharsets();

if ( prob.length > 0 )
{
charset = prob[0];
}
}

if ( isAscii )
{
charset = "ASCII";
}
charset = match.getName().toUpperCase( Locale.ENGLISH );
}
}



public FileCharsetDetector( InputStream detectedStream )
throws FileNotFoundException, IOException
throws IOException
{
nsDetector det = new nsDetector( nsPSMDetector.ALL );

det.Init( new nsICharsetDetectionObserver()
{
@Override
@SuppressWarnings( "checkstyle:methodname" )
public void Notify( String charset )
{
FileCharsetDetector.this.charset = charset;
FileCharsetDetector.this.found = true;
}
} );

BufferedInputStream imp = new BufferedInputStream( detectedStream );
CharsetDetector detector = new CharsetDetector();
detector.setText( detectedStream );
CharsetMatch match = detector.detect();

byte[] buf = new byte[1024];
int len;
boolean done = false;
boolean isAscii = true;

while ( ( len = imp.read( buf, 0, buf.length ) ) != -1 )
{
// Check if the stream is only ascii.
if ( isAscii )
{
isAscii = det.isAscii( buf, len );
}

// DoIt if non-ascii and not done yet.
if ( !isAscii && !done )
{
done = det.DoIt( buf, len, false );
found = done;
}
}
det.DataEnd();

if ( !isFound() )
{
String[] prob = det.getProbableCharsets();

if ( prob.length > 0 )
{
charset = prob[0];
}
}

if ( isAscii )
{
charset = "ASCII";
}
charset = match.getName().toUpperCase( Locale.ENGLISH );
}

public String getCharset()
Expand All @@ -164,6 +71,6 @@ public String getCharset()

public boolean isFound()
{
return found;
return true;
}
}
8 changes: 3 additions & 5 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -201,11 +201,6 @@
<artifactId>velocity</artifactId>
<version>1.7</version>
</dependency>
<dependency>
<groupId>net.sourceforge.jchardet</groupId>
<artifactId>jchardet</artifactId>
<version>1.0</version>
</dependency>
<dependency>
<groupId>commons-collections</groupId>
<artifactId>commons-collections</artifactId>
Expand Down Expand Up @@ -269,6 +264,9 @@
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>3.3.1</version>
<configuration>
<failOnWarnings>false</failOnWarnings>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
Expand Down

0 comments on commit e8cbaca

Please sign in to comment.