// ToolsCharset.java 4.64 KB
package com.taover.util;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;

/**
 * Structural checks for whether raw bytes (or a file's content) are laid out
 * like UTF-8 or like GBK.
 *
 * <p>Both checks only validate lead-byte / trail-byte structure. Pure ASCII
 * input passes both, and some byte sequences can satisfy both checks at once,
 * so a {@code true} result means "could be this encoding", not "is definitely
 * this encoding".
 */
public class ToolsCharset {
	/** Size of the scratch buffer used while streaming a file into memory. */
	private static final int READ_CHUNK_SIZE = 8192;

	/**
	 * Checks whether the entire content of a file is structurally valid UTF-8.
	 *
	 * @param dataFile file to inspect; it is read completely into memory
	 * @return {@code true} if {@link #isUtf8(byte[], int)} accepts the file content
	 *         (an empty file is accepted)
	 * @throws Exception a {@link FileNotFoundException} if the file does not exist,
	 *                   or an {@link IOException} if it cannot be read
	 */
	public static boolean isUtf8File(File dataFile) throws Exception{
		requireExistingFile(dataFile);
		byte[] dataBuffer = readFileToByteArr(dataFile);
		return isUtf8(dataBuffer, dataBuffer.length);
	}
	
	/**
	 * Checks whether the entire content of a file is structurally valid GBK.
	 *
	 * @param dataFile file to inspect; it is read completely into memory
	 * @return {@code true} if {@link #isGBK(byte[], int)} accepts the file content
	 *         (an empty file is accepted)
	 * @throws Exception a {@link FileNotFoundException} if the file does not exist,
	 *                   or an {@link IOException} if it cannot be read
	 */
	public static boolean isGBKFile(File dataFile) throws Exception{
		requireExistingFile(dataFile);
		byte[] dataBuffer = readFileToByteArr(dataFile);
		return isGBK(dataBuffer, dataBuffer.length);
	}
	
	/** Fails with a {@link FileNotFoundException} when the given file does not exist. */
	private static void requireExistingFile(File dataFile) throws FileNotFoundException {
		if(!dataFile.exists()) {
			throw new FileNotFoundException("file not found");
		}
	}
	
	/**
	 * Reads the whole file into a byte array.
	 *
	 * @param dataFile file to read
	 * @return the file content; an empty array for an empty file
	 * @throws IOException if the file cannot be opened or read
	 */
	private static byte[] readFileToByteArr(File dataFile) throws IOException {
		// try-with-resources closes both streams even when available()/read() fails.
		try (FileInputStream fileIS = new FileInputStream(dataFile);
				// available() is only a sizing hint for the output buffer.
				ByteArrayOutputStream arrStream = new ByteArrayOutputStream(Math.max(fileIS.available(), 32))) {
			// The chunk must never be zero-length: read() on an empty array returns 0
			// instead of -1, which would make the loop below spin forever on an empty file.
			byte[] chunk = new byte[READ_CHUNK_SIZE];
			int readLen;
			while((readLen = fileIS.read(chunk)) != -1) {
				arrStream.write(chunk, 0, readLen);
			}
			return arrStream.toByteArray();
		}
	}

	/**
	 * Checks whether the first {@code length} bytes are structurally valid UTF-8.
	 *
	 * <p>The legacy 1-6 byte form of UTF-8 is accepted (lead bytes up to 0xFD).
	 * Only the lead/continuation byte structure is verified; overlong encodings
	 * and code point ranges are not checked. A multi-byte sequence that is cut
	 * off at the end of the inspected range makes the check fail.
	 *
	 * <p>Charset layout, see:
	 *  http://jszx.cuit.edu.cn/NewsCont.asp?bm=00&type=1009&id=20575
	 * <br>Reference implementation:
	 *  https://www.cnblogs.com/Toney-01-22/p/9935297.html
	 *
	 * @param contentByte bytes to inspect; must not be {@code null}
	 * @param length number of leading bytes to inspect; values larger than the
	 *               array length are clamped to it, and nothing is inspected
	 *               when it is zero or negative
	 * @return {@code true} if the inspected bytes form complete UTF-8 sequences
	 *         (including the case where nothing is inspected or all bytes are ASCII)
	 */
	public static boolean isUtf8(byte[] contentByte, int length) {
		int limit = Math.min(contentByte.length, length);
		// Number of continuation bytes (10xxxxxx) still owed by the current character.
		int pendingContinuation = 0;
		
		for (int i=0; i<limit; ++i) {
			int chr = contentByte[i] & 0xFF;
			
			if (pendingContinuation > 0) {
				// Inside a multi-byte character: every following byte must be 10xxxxxx.
				if ((chr & 0xC0) != 0x80) {
					return false;
				}
				pendingContinuation--;
				continue;
			}
			
			if (chr < 0x80) {
				// Plain ASCII, 0xxxxxxx.
				continue;
			}
			
			// 0x80-0xBF is a stray continuation byte; 0xFE and 0xFF are never
			// valid in UTF-8, not even in the legacy 6-byte scheme.
			if (chr < 0xC0 || chr > 0xFD) {
				return false;
			}
			
			// Lead byte: derive how many continuation bytes must follow.
			if (chr >= 0xFC) {
				pendingContinuation = 5;
			} else if (chr >= 0xF8) {
				pendingContinuation = 4;
			} else if (chr >= 0xF0) {
				pendingContinuation = 3;
			} else if (chr >= 0xE0) {
				pendingContinuation = 2;
			} else {
				pendingContinuation = 1;
			}
		}
		
		// A character that is still incomplete at the end violates the encoding rules.
		return pendingContinuation == 0;
	}
	
	/**
	 * Checks whether the first {@code length} bytes are structurally valid GBK.
	 *
	 * <p>GBK uses one byte for ASCII and two bytes otherwise: a lead byte in
	 * 0x81-0xFE followed by a trail byte in 0x40-0x7E or 0x80-0xFE. A two-byte
	 * character that is cut off at the end of the inspected range makes the
	 * check fail.
	 *
	 * <p>Charset layout, see:
	 *  https://www.qqxiuzi.cn/zh/hanzi-gbk-bianma.php
	 * <br>Reference implementation:
	 *  https://www.cnblogs.com/Toney-01-22/p/9935297.html
	 *
	 * @param contentByte bytes to inspect; must not be {@code null}
	 * @param length number of leading bytes to inspect; values larger than the
	 *               array length are clamped to it, and nothing is inspected
	 *               when it is zero or negative
	 * @return {@code true} if the inspected bytes form complete GBK characters
	 *         (including the case where nothing is inspected or all bytes are ASCII)
	 */
	public static boolean isGBK(byte[] contentByte, int length) {
		int limit = Math.min(contentByte.length, length);
		// True while the previous byte was a lead byte that still needs its trail byte.
		boolean expectTrailByte = false;
		
		for (int i=0; i<limit; ++i) {
			int chr = contentByte[i] & 0xFF;
			
			if (expectTrailByte) {
				// Trail byte range is 0x40-0xFE with 0x7F excluded.
				if (chr < 0x40 || chr > 0xFE || chr == 0x7F) {
					return false;
				}
				expectTrailByte = false;
			} else if (chr >= 0x80) {
				// Non-ASCII byte must be a lead byte in 0x81-0xFE (0x80 and 0xFF are invalid).
				if (chr < 0x81 || chr > 0xFE) {
					return false;
				}
				expectTrailByte = true;
			}
		}
		
		// A lead byte without its trail byte violates the encoding rules.
		return !expectTrailByte;
	}
}