package com.taover.util; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; public class ToolsCharset { /** * 检查是否utf编码文件 * @param dataFile * @return * @throws Exception */ public static boolean isUtf8File(File dataFile) throws Exception{ if(!dataFile.exists()) { throw new Exception("file not found"); } byte[] dataBuffer = readFileToByteArr(dataFile); return isUtf8(dataBuffer, dataBuffer.length); } /** * 检查是否gbk编码文件 * @param dataFile * @return * @throws Exception */ public static boolean isGBKFile(File dataFile) throws Exception{ if(!dataFile.exists()) { throw new Exception("file not found"); } byte[] dataBuffer = readFileToByteArr(dataFile); return isGBK(dataBuffer, dataBuffer.length); } private static byte[] readFileToByteArr(File dataFile) throws IOException { FileInputStream fileIS = new FileInputStream(dataFile); int estimateLen = (int)(fileIS.available() * 1.2); ByteArrayOutputStream arrStream = new ByteArrayOutputStream(estimateLen); byte[] buffer = null; try { int readLen = -1; buffer = new byte[estimateLen]; while((readLen = fileIS.read(buffer)) != -1) { arrStream.write(buffer, 0, readLen); } buffer = arrStream.toByteArray(); }catch (Exception e) { throw e; }finally { arrStream.close(); fileIS.close(); } return buffer; } /** * 字符集格式,参见: * http://jszx.cuit.edu.cn/NewsCont.asp?bm=00&type=1009&id=20575 * 参考实现: * https://www.cnblogs.com/Toney-01-22/p/9935297.html * @param contentByte * @return */ public static boolean isUtf8(byte[] contentByte, int length) { //System.out.println("DATA>>>\n"+Hex.encodeHexString(contentByte)); int nBytes = 0;//UFT8可用1-6个字节编码,ASCII用一个字节 boolean bAllAscii = true; for (int i=0; i= 0x80) { if (chr >= 0xFC && chr <= 0xFD) { nBytes = 6; } else if (chr >= 0xF8) { nBytes = 5; } else if (chr >= 0xF0) { nBytes = 4; } else if (chr >= 0xE0) { nBytes = 3; } else if (chr >= 0xC0) { nBytes = 2; } else { return false; } nBytes--; } } else { //多字节符的非首字节,应为 10xxxxxx if ((chr & 0xC0) != 0x80) { return false; } //减到为零为止 nBytes--; } } //违返UTF8编码规则 if (nBytes != 0) { return false; } if (bAllAscii) { //如果全部都是ASCII, 也是UTF8 return true; } return true; } /** * 字符集格式,参见: * https://www.qqxiuzi.cn/zh/hanzi-gbk-bianma.php * 参考实现: * https://www.cnblogs.com/Toney-01-22/p/9935297.html * @param contentByte * @return */ public static boolean isGBK(byte[] contentByte, int length) { int nBytes = 0;//GBK可用1-2个字节编码,中文两个 ,英文一个 boolean bAllAscii = true; //如果全部都是ASCII, for (int i=0; i= 0x80) { if (chr >= 0x81 && chr <= 0xFE) { nBytes = +2; } else { return false; } nBytes--; } } else { if (chr < 0x40 || chr>0xFE) { return false; } nBytes--; }//else end } if (nBytes != 0) { //违返规则 return false; } if (bAllAscii) { //如果全部都是ASCII, 也是GBK return true; } return true; } }