package cn.keking.utils; import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import org.mozilla.intl.chardet.nsDetector; import org.mozilla.intl.chardet.nsICharsetDetectionObserver; /** * 文本文件编码探测工具类 * * @author HWliao * @date 2017-12-24 */ public class FileCharsetDetector { /** * 传入一个文件(File)对象,检查文件编码 * * @param file File对象实例 * @return 文件编码,若无,则返回null * @throws FileNotFoundException * @throws IOException */ public static Observer guessFileEncoding(File file) throws FileNotFoundException, IOException { return guessFileEncoding(file, new nsDetector()); } /** *
* 获取文件的编码 * @param file * File对象实例 * @param languageHint * 语言提示区域代码 @see #nsPSMDetector ,取值如下: * 1 : Japanese * 2 : Chinese * 3 : Simplified Chinese * 4 : Traditional Chinese * 5 : Korean * 6 : Dont know(default) ** * @return 文件编码,eg:UTF-8,GBK,GB2312形式(不确定的时候,返回可能的字符编码序列);若无,则返回null * @throws FileNotFoundException * @throws IOException */ public static Observer guessFileEncoding(File file, int languageHint) throws FileNotFoundException, IOException { return guessFileEncoding(file, new nsDetector(languageHint)); } /** * 获取文件的编码 * * @param file * @param det * @return * @throws FileNotFoundException * @throws IOException */ private static Observer guessFileEncoding(File file, nsDetector det) throws FileNotFoundException, IOException { // new Observer Observer observer = new Observer(); // set Observer // The Notify() will be called when a matching charset is found. det.Init(observer); BufferedInputStream imp = new BufferedInputStream(new FileInputStream( file)); byte[] buf = new byte[1024]; int len; boolean done = false; boolean isAscii = false; while ((len = imp.read(buf, 0, buf.length)) != -1) { // Check if the stream is only ascii. isAscii = det.isAscii(buf, len); if (isAscii) { break; } // DoIt if non-ascii and not done yet. done = det.DoIt(buf, len, false); if (done) { break; } } imp.close(); det.DataEnd(); if (isAscii) { observer.encoding = "ASCII"; observer.found = true; } if (!observer.isFound()) { String[] prob = det.getProbableCharsets(); // // 这里将可能的字符集组合起来返回 // for (int i = 0; i < prob.length; i++) { // if (i == 0) { // encoding = prob[i]; // } else { // encoding += "," + prob[i]; // } // } if (prob.length > 0) { // 在没有发现情况下,去第一个可能的编码 observer.encoding = prob[0]; } else { observer.encoding = null; } } return observer; } /** * @author liaohongwei * @Description: 文件字符编码观察者, 但判断出字符编码时候调用 * @date 2016年6月20日 下午2:27:06 */ public static class Observer implements nsICharsetDetectionObserver { /** * @Fields encoding : 字符编码 */ private String encoding = null; /** * @Fields found : 是否找到字符集 */ private boolean found = false; @Override public void Notify(String charset) { this.encoding = charset; this.found = true; } public String getEncoding() { return encoding; } public boolean isFound() { return found; } @Override public String toString() { return "Observer [encoding=" + encoding + ", found=" + found + "]"; } } }