java中判斷位元組陣列的編碼方式是不是UTF-8
阿新 • • 發佈:2019-01-10
1. 使用託管於 Google Code 的 juniversalchardet 工具包，在 Maven 的 pom.xml 中加入以下依賴：
<!-- https://mvnrepository.com/artifact/com.googlecode.juniversalchardet/juniversalchardet --> <dependency> <groupId>com.googlecode.juniversalchardet</groupId> <artifactId>juniversalchardet</artifactId> <version>1.0.3</version> </dependency>
2. 定義一個公共方法，用來猜測位元組陣列的編碼（無法判斷時回傳 null）：
/**
 * Guesses the character encoding of a byte array using juniversalchardet's
 * {@code UniversalDetector}.
 *
 * @param bytes raw content to inspect; may be {@code null} or empty
 * @return the charset name reported by the detector, or {@code null} when
 *         {@code bytes} is {@code null} or empty, or when the detector does
 *         not report a charset
 */
public static String guessEncoding(byte[] bytes) {
    // Nothing to inspect: return early instead of failing on bytes.length.
    if (bytes == null || bytes.length == 0) {
        return null;
    }
    UniversalDetector detector = new UniversalDetector(null);
    try {
        detector.handleData(bytes, 0, bytes.length);
        // Signal end of input so the detector can settle on a result.
        detector.dataEnd();
        return detector.getDetectedCharset();
    } finally {
        // Reset even if detection throws, mirroring the original cleanup call.
        detector.reset();
    }
}
public abstract class CharsetUtils { private static Logger logger = LoggerFactory.getLogger(CharsetUtils.class); public static String detectCharset(String contentType, byte[] contentBytes) throws IOException { String charset; // charset // 1、encoding in http header Content-Type charset = UrlUtils.getCharset(contentType); if (StringUtils.isNotBlank(contentType) && StringUtils.isNotBlank(charset)) { logger.debug("Auto get charset: {}", charset); return charset; } // use default charset to decode first time Charset defaultCharset = Charset.defaultCharset(); String content = new String(contentBytes, defaultCharset); // 2、charset in meta if (StringUtils.isNotEmpty(content)) { Document document = Jsoup.parse(content); Elements links = document.select("meta"); for (Element link : links) { // 2.1、html4.01 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> String metaContent = link.attr("content"); String metaCharset = link.attr("charset"); if (metaContent.indexOf("charset") != -1) { metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length()); charset = metaContent.split("=")[1]; break; } // 2.2、html5 <meta charset="UTF-8" /> else if (StringUtils.isNotEmpty(metaCharset)) { charset = metaCharset; break; } } } logger.debug("Auto get charset: {}", charset); // 3、todo use tools as cpdetector for content decode charset=guessEncoding(contentBytes); return charset; } }
/**
 * Matches a {@code charset=...} parameter case-insensitively, allowing optional
 * whitespace around {@code =} and optional quotes; group 1 is the charset name.
 */
private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)", Pattern.CASE_INSENSITIVE);

/**
 * Extracts the charset name from a {@code Content-Type} style value.
 *
 * @param contentType header value such as {@code text/html; charset=UTF-8};
 *                    may be {@code null}
 * @return the charset name exactly as written in {@code contentType} if it is
 *         supported by this JVM; {@code null} if {@code contentType} is
 *         {@code null}, has no charset parameter, or names a charset that is
 *         empty, syntactically illegal or unsupported
 */
public static String getCharset(String contentType) {
    if (contentType == null) {
        return null;
    }
    Matcher matcher = patternForCharset.matcher(contentType);
    if (!matcher.find()) {
        return null;
    }
    String charset = matcher.group(1);
    try {
        return Charset.isSupported(charset) ? charset : null;
    } catch (java.nio.charset.IllegalCharsetNameException e) {
        // The header is external input: an empty or malformed name (e.g. "charset=")
        // makes isSupported throw, which is treated here as "no usable charset".
        return null;
    }
}