ToolsCharset.java
4.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
package com.taover.util;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
/**
 * Heuristic helpers that check whether a byte sequence (or the content of a
 * file) is structurally consistent with the UTF-8 or the GBK encoding.
 *
 * <p>The checks only look at the byte layout (lead byte / trailing byte
 * ranges); they do not verify that every code point is actually assigned.
 * Pure 7-bit ASCII data passes both {@link #isUtf8(byte[], int)} and
 * {@link #isGBK(byte[], int)}.
 */
public class ToolsCharset {
    /** Size of the chunk buffer used when loading a file into memory. */
    private static final int READ_BUFFER_SIZE = 8192;

    /**
     * Checks whether the content of a file is structurally valid UTF-8.
     *
     * @param dataFile file to inspect; it is read completely into memory
     * @return {@code true} if the whole content passes {@link #isUtf8(byte[], int)}
     * @throws Exception if the file does not exist ("file not found") or cannot be read
     */
    public static boolean isUtf8File(File dataFile) throws Exception {
        if (!dataFile.exists()) {
            throw new Exception("file not found");
        }
        byte[] dataBuffer = readFileToByteArr(dataFile);
        return isUtf8(dataBuffer, dataBuffer.length);
    }

    /**
     * Checks whether the content of a file is structurally valid GBK.
     *
     * @param dataFile file to inspect; it is read completely into memory
     * @return {@code true} if the whole content passes {@link #isGBK(byte[], int)}
     * @throws Exception if the file does not exist ("file not found") or cannot be read
     */
    public static boolean isGBKFile(File dataFile) throws Exception {
        if (!dataFile.exists()) {
            throw new Exception("file not found");
        }
        byte[] dataBuffer = readFileToByteArr(dataFile);
        return isGBK(dataBuffer, dataBuffer.length);
    }

    /**
     * Reads the complete content of a file into a byte array.
     *
     * @param dataFile file to read
     * @return all bytes of the file; an empty array for an empty file
     * @throws IOException if the file cannot be opened or read
     */
    private static byte[] readFileToByteArr(File dataFile) throws IOException {
        // try-with-resources closes both streams on every path, including a
        // failure while the output stream is being created.
        try (FileInputStream fileIS = new FileInputStream(dataFile);
                ByteArrayOutputStream arrStream =
                        new ByteArrayOutputStream(Math.max(READ_BUFFER_SIZE, fileIS.available()))) {
            // The chunk buffer must never be zero-length: read(byte[0]) returns 0
            // instead of -1, which would turn the loop below into an endless loop
            // for an empty file.
            byte[] chunk = new byte[READ_BUFFER_SIZE];
            int readLen;
            while ((readLen = fileIS.read(chunk)) != -1) {
                arrStream.write(chunk, 0, readLen);
            }
            return arrStream.toByteArray();
        }
    }

    /**
     * Checks whether the first {@code length} bytes form a structurally valid
     * UTF-8 sequence.
     *
     * <p>Each character must consist of a lead byte followed by the number of
     * {@code 10xxxxxx} continuation bytes announced by that lead byte, and the
     * data must not end in the middle of a character. The legacy 5- and 6-byte
     * forms (lead bytes 0xF8-0xFD) are accepted; overlong encodings are not
     * detected. The bytes 0xFE and 0xFF are rejected because they are not a
     * valid lead byte in any UTF-8 variant.
     *
     * <p>Encoding layout reference:
     * http://jszx.cuit.edu.cn/NewsCont.asp?bm=00&amp;type=1009&amp;id=20575<br>
     * Reference implementation:
     * https://www.cnblogs.com/Toney-01-22/p/9935297.html
     *
     * @param contentByte bytes to inspect
     * @param length number of leading bytes to inspect; values larger than the
     *        array length are clamped, values &lt;= 0 inspect nothing
     * @return {@code true} if the inspected bytes are consistent with UTF-8
     *         (including the empty and the pure-ASCII case)
     */
    public static boolean isUtf8(byte[] contentByte, int length) {
        // Number of continuation bytes still expected for the current character.
        int pending = 0;
        int limit = Math.min(contentByte.length, length);
        for (int i = 0; i < limit; ++i) {
            int chr = contentByte[i] & 0xFF;
            if (pending == 0) {
                if (chr < 0x80) {
                    // 0xxxxxxx: single-byte ASCII character.
                    continue;
                }
                // Multi-byte character: derive the continuation count from the lead byte.
                if (chr >= 0xFE) {
                    // 0xFE / 0xFF never occur in UTF-8.
                    return false;
                } else if (chr >= 0xFC) {
                    pending = 5;
                } else if (chr >= 0xF8) {
                    pending = 4;
                } else if (chr >= 0xF0) {
                    pending = 3;
                } else if (chr >= 0xE0) {
                    pending = 2;
                } else if (chr >= 0xC0) {
                    pending = 1;
                } else {
                    // 10xxxxxx without a preceding lead byte.
                    return false;
                }
            } else {
                // Non-lead bytes of a multi-byte character must be 10xxxxxx.
                if ((chr & 0xC0) != 0x80) {
                    return false;
                }
                pending--;
            }
        }
        // A character that is still incomplete at the end violates the encoding rules.
        return pending == 0;
    }

    /**
     * Checks whether the first {@code length} bytes form a structurally valid
     * GBK sequence.
     *
     * <p>GBK uses one byte for ASCII (0x00-0x7F) and two bytes for everything
     * else: a lead byte in 0x81-0xFE followed by a trail byte in 0x40-0x7E or
     * 0x80-0xFE (0x7F is not a valid trail byte). The data must not end after
     * a lead byte.
     *
     * <p>Encoding layout reference:
     * https://www.qqxiuzi.cn/zh/hanzi-gbk-bianma.php<br>
     * Reference implementation:
     * https://www.cnblogs.com/Toney-01-22/p/9935297.html
     *
     * @param contentByte bytes to inspect
     * @param length number of leading bytes to inspect; values larger than the
     *        array length are clamped, values &lt;= 0 inspect nothing
     * @return {@code true} if the inspected bytes are consistent with GBK
     *         (including the empty and the pure-ASCII case)
     */
    public static boolean isGBK(byte[] contentByte, int length) {
        // True while the second byte of a double-byte character is expected.
        boolean expectTrail = false;
        int limit = Math.min(contentByte.length, length);
        for (int i = 0; i < limit; ++i) {
            int chr = contentByte[i] & 0xFF;
            if (!expectTrail) {
                if (chr >= 0x80) {
                    // Non-ASCII: must be a valid lead byte (0x80 and 0xFF are not).
                    if (chr < 0x81 || chr > 0xFE) {
                        return false;
                    }
                    expectTrail = true;
                }
            } else {
                // Trail byte range is 0x40-0xFE with 0x7F excluded.
                if (chr < 0x40 || chr > 0xFE || chr == 0x7F) {
                    return false;
                }
                expectTrail = false;
            }
        }
        // A dangling lead byte at the end violates the encoding rules.
        return !expectTrail;
    }
}