218 lines
6.9 KiB
Java
218 lines
6.9 KiB
Java
/*
|
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
* contributor license agreements. See the NOTICE file distributed with
|
|
* this work for additional information regarding copyright ownership.
|
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
* (the "License"); you may not use this file except in compliance with
|
|
* the License. You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
package org.apache.jasper.compiler;
|
|
|
|
import java.io.BufferedInputStream;
|
|
import java.io.IOException;
|
|
import java.io.InputStream;
|
|
|
|
import javax.xml.stream.XMLInputFactory;
|
|
import javax.xml.stream.XMLStreamException;
|
|
import javax.xml.stream.XMLStreamReader;
|
|
|
|
/*
 * The BOM detection is derived from:
 * https://svn.us.apache.org/viewvc/tomcat/trunk/java/org/apache/jasper/xmlparser/XMLEncodingDetector.java?annotate=1742248
 *
 * The prolog is always at least as specific as the BOM; therefore any encoding
 * specified in the prolog should take priority over the BOM.
 */
|
|
/**
 * Determines the character encoding of a stream by inspecting its first bytes
 * for a byte order mark (BOM) or another recognisable byte pattern, and then
 * asking an XML stream reader for the encoding declared in an XML prolog.
 * <p>
 * An encoding declared in the prolog always wins over the one derived from the
 * leading bytes. When neither source yields an answer the encoding is reported
 * as UTF-8.
 */
class EncodingDetector {

    private static final XMLInputFactory XML_INPUT_FACTORY;
    static {
        XML_INPUT_FACTORY = XMLInputFactory.newInstance();
    }

    /** Number of leading bytes that are examined (and therefore buffered). */
    private static final int BOM_LOOKAHEAD = 4;

    /** Encoding reported when the leading bytes give no better answer. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    private final String encoding;
    private final int skip;
    private final boolean encodingSpecifiedInProlog;


    /*
     * TODO: Refactor Jasper InputStream creation and handling so the
     *       InputStream passed to this method is buffered and therefore saves
     *       on multiple opening and re-opening of the same file.
     */
    /**
     * Runs the detection against the supplied stream. The stream is read from
     * but is not closed by this constructor.
     *
     * @param is the stream to examine, positioned at its first byte
     *
     * @throws IOException if the stream cannot be reset or read while
     *                     rewinding past the inspected bytes
     */
    EncodingDetector(InputStream is) throws IOException {
        // Keep buffer size to a minimum here. The BOM will be no more than 4
        // bytes so that is the maximum we need to buffer.
        BufferedInputStream bis = new BufferedInputStream(is, BOM_LOOKAHEAD);
        bis.mark(BOM_LOOKAHEAD);

        BomResult bomResult = processBom(bis);

        // Rewind to the start so that the XML prolog detection sees the
        // document from its first character, then step over any BOM found.
        bis.reset();
        for (int i = 0; i < bomResult.skip; i++) {
            bis.read();
        }

        String prologEncoding = getPrologEncoding(bis);
        if (prologEncoding == null) {
            encodingSpecifiedInProlog = false;
            encoding = bomResult.encoding;
        } else {
            // The prolog takes priority over whatever the BOM suggested.
            encodingSpecifiedInProlog = true;
            encoding = prologEncoding;
        }
        skip = bomResult.skip;
    }


    /**
     * @return the encoding declared in the XML prolog if there was one,
     *         otherwise the encoding derived from the leading bytes
     */
    String getEncoding() {
        return encoding;
    }


    /**
     * @return the number of BOM bytes found at the start of the stream (0 if
     *         no BOM was recognised)
     */
    int getSkip() {
        return skip;
    }


    /**
     * @return {@code true} if the reported encoding came from the XML prolog
     *         rather than from the leading bytes
     */
    boolean isEncodingSpecifiedInProlog() {
        return encodingSpecifiedInProlog;
    }


    /**
     * Asks an XML stream reader for the encoding declared in the document's
     * XML declaration.
     *
     * @param stream the stream to parse, positioned after any BOM
     *
     * @return the declared encoding, or {@code null} if the reader reports
     *         none or the reader could not be created
     */
    private static String getPrologEncoding(InputStream stream) {
        XMLStreamReader xmlStreamReader = null;
        try {
            xmlStreamReader = XML_INPUT_FACTORY.createXMLStreamReader(stream);
            return xmlStreamReader.getCharacterEncodingScheme();
        } catch (XMLStreamException e) {
            // Best effort only: content that is not XML is expected here and
            // simply means that there is no prolog encoding to report.
            return null;
        } finally {
            // XMLStreamReader is not AutoCloseable so it has to be released
            // explicitly. Closing the reader does not close the stream.
            if (xmlStreamReader != null) {
                try {
                    xmlStreamReader.close();
                } catch (XMLStreamException ignored) {
                    // Nothing useful can be done if the clean-up fails.
                }
            }
        }
    }


    /**
     * Reads up to the first four bytes of the stream and derives an encoding
     * from them.
     *
     * @param stream the stream to read from
     *
     * @return the detection result; UTF-8 with nothing to skip if reading
     *         fails
     */
    private static BomResult processBom(InputStream stream) {
        try {
            final byte[] b4 = new byte[BOM_LOOKAHEAD];
            int count = 0;
            // Read byte by byte so that a short stream yields a short count
            // rather than an error.
            while (count < BOM_LOOKAHEAD) {
                int singleByteRead = stream.read();
                if (singleByteRead == -1) {
                    break;
                }
                b4[count] = (byte) singleByteRead;
                count++;
            }
            return parseBom(b4, count);
        } catch (IOException ioe) {
            // Failed. Fall back to the default.
            return new BomResult(DEFAULT_ENCODING, 0);
        }
    }


    /**
     * Maps the leading bytes of a stream to an encoding and a BOM length.
     *
     * @param b4    buffer holding the leading bytes
     * @param count number of valid bytes in {@code b4}
     *
     * @return the encoding implied by the bytes together with the number of
     *         BOM bytes to skip; UTF-8 with nothing to skip when there are too
     *         few bytes or no pattern matches
     */
    private static BomResult parseBom(byte[] b4, int count) {

        if (count < 2) {
            return new BomResult(DEFAULT_ENCODING, 0);
        }

        // UTF-16, with BOM
        // NOTE(review): a UTF-32LE BOM (FF FE 00 00) also matches the
        // little-endian test below and is reported as UTF-16LE. Confirm
        // whether UTF-32 input needs to be recognised.
        int b0 = b4[0] & 0xFF;
        int b1 = b4[1] & 0xFF;
        if (b0 == 0xFE && b1 == 0xFF) {
            // UTF-16, big-endian
            return new BomResult("UTF-16BE", 2);
        }
        if (b0 == 0xFF && b1 == 0xFE) {
            // UTF-16, little-endian
            return new BomResult("UTF-16LE", 2);
        }

        // Default to UTF-8 if we don't have enough bytes to make a good
        // determination of the encoding
        if (count < 3) {
            return new BomResult(DEFAULT_ENCODING, 0);
        }

        // UTF-8 with a BOM
        int b2 = b4[2] & 0xFF;
        if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
            return new BomResult("UTF-8", 3);
        }

        // Default to UTF-8 if we don't have enough bytes to make a good
        // determination of the encoding
        if (count < 4) {
            return new BomResult(DEFAULT_ENCODING, 0);
        }

        // Other encodings. No BOM. Try and ID the encoding from the position
        // of the first '<' (0x3C) and '?' (0x3F) bytes.
        int b3 = b4[3] & 0xFF;
        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
            // UCS-4, big endian (1234)
            return new BomResult("ISO-10646-UCS-4", 0);
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
            // UCS-4, little endian (4321)
            return new BomResult("ISO-10646-UCS-4", 0);
        }
        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
            // UCS-4, unusual octet order (2143)
            // REVISIT: What should this be?
            return new BomResult("ISO-10646-UCS-4", 0);
        }
        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
            // UCS-4, unusual octet order (3412)
            // REVISIT: What should this be?
            return new BomResult("ISO-10646-UCS-4", 0);
        }
        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
            // UTF-16, big-endian, no BOM
            // (or could turn out to be UCS-2...
            // REVISIT: What should this be?
            return new BomResult("UTF-16BE", 0);
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
            // UTF-16, little-endian, no BOM
            // (or could turn out to be UCS-2...
            return new BomResult("UTF-16LE", 0);
        }
        if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
            // EBCDIC
            // a la xerces1, return CP037 instead of EBCDIC here
            return new BomResult("CP037", 0);
        }

        // Default encoding
        return new BomResult(DEFAULT_ENCODING, 0);
    }


    /**
     * Immutable pair of the encoding derived from the leading bytes and the
     * number of BOM bytes that precede the content.
     */
    private static class BomResult {

        public final String encoding;
        public final int skip;

        public BomResult(String encoding, int skip) {
            this.encoding = encoding;
            this.skip = skip;
        }
    }
}
|