ToolsCharset.java 4.64 KB
Edit Raw Blame History



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176


package com.taover.util;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

public class ToolsCharset {
	/** Size of the chunk buffer used while streaming a file into memory. */
	private static final int READ_BUFFER_SIZE = 8192;

	/**
	 * Checks whether the complete content of a file passes {@link #isUtf8(byte[], int)}.
	 * The whole file is loaded into memory before it is inspected.
	 *
	 * @param dataFile file to inspect
	 * @return {@code true} if every byte of the file forms structurally valid UTF-8
	 *         (an empty file counts as valid)
	 * @throws Exception with message {@code "file not found"} if the file does not exist,
	 *         or an {@link IOException} if the file cannot be read
	 */
	public static boolean isUtf8File(File dataFile) throws Exception{
		byte[] dataBuffer = readExistingFile(dataFile);
		return isUtf8(dataBuffer, dataBuffer.length);
	}
	
	/**
	 * Checks whether the complete content of a file passes {@link #isGBK(byte[], int)}.
	 * The whole file is loaded into memory before it is inspected.
	 *
	 * @param dataFile file to inspect
	 * @return {@code true} if every byte of the file forms structurally valid GBK
	 *         (an empty file counts as valid)
	 * @throws Exception with message {@code "file not found"} if the file does not exist,
	 *         or an {@link IOException} if the file cannot be read
	 */
	public static boolean isGBKFile(File dataFile) throws Exception{
		byte[] dataBuffer = readExistingFile(dataFile);
		return isGBK(dataBuffer, dataBuffer.length);
	}

	/**
	 * Verifies that the file exists and returns its full content.
	 *
	 * @param dataFile file to read
	 * @return all bytes of the file
	 * @throws Exception with message {@code "file not found"} if the file does not exist,
	 *         or an {@link IOException} if reading fails
	 */
	private static byte[] readExistingFile(File dataFile) throws Exception {
		if(!dataFile.exists()) {
			throw new Exception("file not found");
		}
		return readFileToByteArr(dataFile);
	}
	
	/**
	 * Reads the full content of a file into a byte array.
	 *
	 * @param dataFile file to read
	 * @return all bytes of the file; an empty array for an empty file
	 * @throws IOException if the file cannot be opened or read
	 */
	private static byte[] readFileToByteArr(File dataFile) throws IOException {
		// try-with-resources closes the stream on every exit path, including failures
		// that happen before the first read.
		try (FileInputStream fileIS = new FileInputStream(dataFile)) {
			// available() is only an estimate, so it is used purely as a presizing hint.
			ByteArrayOutputStream arrStream = new ByteArrayOutputStream(Math.max(fileIS.available(), 32));
			// The read buffer must never be zero-length: read(byte[0]) returns 0 instead
			// of -1, which would make the loop below spin forever on an empty file.
			byte[] buffer = new byte[READ_BUFFER_SIZE];
			int readLen;
			while((readLen = fileIS.read(buffer)) != -1) {
				arrStream.write(buffer, 0, readLen);
			}
			return arrStream.toByteArray();
		}
	}

	/**
	 * Checks whether the first {@code length} bytes of {@code contentByte} are structurally
	 * valid UTF-8: every lead byte is followed by the number of {@code 10xxxxxx}
	 * continuation bytes that it announces, and the data does not end in the middle of a
	 * sequence. Pure ASCII input is valid UTF-8.
	 *
	 * <p>This is a structural check only. The legacy 5- and 6-byte forms (lead bytes
	 * {@code 0xF8}-{@code 0xFD}) are still accepted, and overlong encodings or surrogate
	 * code points are not rejected. The bytes {@code 0xFE} and {@code 0xFF}, which never
	 * appear in UTF-8, are rejected.
	 *
	 * <p>Character set format, see:
	 *  http://jszx.cuit.edu.cn/NewsCont.asp?bm=00&type=1009&id=20575
	 * <br>Reference implementation:
	 *  https://www.cnblogs.com/Toney-01-22/p/9935297.html
	 *
	 * @param contentByte bytes to inspect
	 * @param length number of leading bytes to inspect; values larger than the array
	 *        length are clamped to the array length
	 * @return {@code true} if the inspected bytes are structurally valid UTF-8
	 *         (also when no bytes are inspected)
	 */
	public static boolean isUtf8(byte[] contentByte, int length) {
		int limit = Math.min(contentByte.length, length);
		// Number of continuation bytes still expected for the current character.
		int pending = 0;

		for (int i = 0; i < limit; ++i) {
			int chr = contentByte[i] & 0xFF;

			if (pending > 0) {
				// Inside a multi-byte character: byte must look like 10xxxxxx.
				if ((chr & 0xC0) != 0x80) {
					return false;
				}
				pending--;
				continue;
			}

			if (chr < 0x80) {
				// Single-byte ASCII character.
				continue;
			}

			// Lead byte of a multi-byte character: derive the continuation count.
			if (chr < 0xC0) {
				// 10xxxxxx without a preceding lead byte.
				return false;
			} else if (chr < 0xE0) {
				pending = 1;
			} else if (chr < 0xF0) {
				pending = 2;
			} else if (chr < 0xF8) {
				pending = 3;
			} else if (chr < 0xFC) {
				pending = 4;
			} else if (chr < 0xFE) {
				pending = 5;
			} else {
				// 0xFE and 0xFF are not lead bytes in any UTF-8 form.
				return false;
			}
		}

		// A non-zero count means the data stopped in the middle of a character.
		return pending == 0;
	}
	
	/**
	 * Checks whether the first {@code length} bytes of {@code contentByte} are structurally
	 * valid GBK: each byte is either a single byte below {@code 0x80}, or a lead byte in
	 * {@code 0x81}-{@code 0xFE} followed by a trail byte in {@code 0x40}-{@code 0xFE}
	 * other than {@code 0x7F}. Pure ASCII input is valid GBK.
	 *
	 * <p>Character set format, see:
	 *  https://www.qqxiuzi.cn/zh/hanzi-gbk-bianma.php
	 * <br>Reference implementation:
	 *  https://www.cnblogs.com/Toney-01-22/p/9935297.html
	 *
	 * @param contentByte bytes to inspect
	 * @param length number of leading bytes to inspect; values larger than the array
	 *        length are clamped to the array length
	 * @return {@code true} if the inspected bytes are structurally valid GBK
	 *         (also when no bytes are inspected)
	 */
	public static boolean isGBK(byte[] contentByte, int length) {
		int limit = Math.min(contentByte.length, length);
		// True while the second byte of a double-byte character is still expected.
		boolean expectTrail = false;

		for (int i = 0; i < limit; ++i) {
			int chr = contentByte[i] & 0xFF;

			if (expectTrail) {
				// Trail bytes span 0x40-0xFE, with 0x7F excluded by the GBK layout.
				if (chr < 0x40 || chr > 0xFE || chr == 0x7F) {
					return false;
				}
				expectTrail = false;
			} else if (chr >= 0x80) {
				// Only 0x81-0xFE may start a double-byte character; 0x80 and 0xFF may not.
				if (chr < 0x81 || chr > 0xFE) {
					return false;
				}
				expectTrail = true;
			}
		}

		// A dangling lead byte at the end of the data is invalid.
		return !expectTrail;
	}
}