解析获取Xml Encoding字符集Charset

/*

     * Given a URI string, open it, read its contents into a String

            

     * and return the String

            

*

     *@param uri the URI to open

            

     *@return the content at the URI or null if any error occurs

            

*/

    private String getRDFfromURI (String uri) throws getRDFException

{

        /* add something like this code here, to allow reading from a file:

            

           (if we really want to allow this!)

            

           File ff = new File(uri);

            

           in = new FileInputStream(ff);

            

*/

        URL url = null;

        try {

            url = new URL(uri);

        } catch (MalformedURLException e) {

            throw new getRDFException("Malformed URI.");

}

        URLConnection con = null;

        try {

            con = url.openConnection();

            con.setRequestProperty("Accept", "application/rdf+xml");

            con.connect();

        } catch (Exception e) {

            throw new getRDFException("Unable to open connection.");

}

        String contentT = con.getContentType();

        String HTTPcharset = null;

        if (contentT != null) {

            ContentType contentType = null;

            try {

                contentType = new ContentType(con.getContentType());

            } catch (javax.mail.internet.ParseException e) {

                throw new getRDFException("Unparsable content type.");

}

            HTTPcharset = contentType.getParameter("charset");

}

        // need buffer for lookahead for encoding detection

        BufferedInputStream bis = null;

        try {

            bis = new BufferedInputStream(con.getInputStream());

        } catch (IOException e) {

            throw new getRDFException("Cannot open stream.");

}

        bis.mark(200); // mark start so that we can get back to it

        String s = "";

        try { // read start of file as bytes

            int c;

            int numRead = 0;

            while ((c = bis.read()) != -1) {

                s += (char)c;

                if (numRead++ >= 195) break;

}

        } catch (IOException e) {

            throw new getRDFException("IOException while starting reading.");

}

        if (s.equals(""))

            // Nothing was returned

            throw new getRDFException("Empty document, ignored.");

        // A server could return content but not the RDF/XML that

        // we need.  Check the beginning of s and if it looks like

        // a generic HTML message, return an error.

        if (s.startsWith("<!DOCTYPE"))

            throw new getRDFException("Document looks like HTML, ignored.");

        String APPFcharset = null; // 'charset' according to XML APP. F

        int ignoreBytes = 0;

        if (s.startsWith("\u00FE\u00FF")) {

            APPFcharset = "UTF-16BE";

            ignoreBytes = 2;

}

        else if (s.startsWith("\u00FF\u00FE")) {

            APPFcharset = "UTF-16LE";

            ignoreBytes = 2;

}

        else if (s.startsWith("\u00EF\u00BB\u00BF")) {

            APPFcharset = "UTF-8";

            ignoreBytes = 3;

}

        else if (s.startsWith("\u0000<\u0000?")) {

            APPFcharset = "UTF-16BE";

}

        else if (s.startsWith("<\u0000?\u0000")) {

            APPFcharset = "UTF-16LE";

}

        else if (s.startsWith("<?xml")) {

            APPFcharset = "iso-8859-1"; //to not loose any bytes

}

        else if (s.startsWith("\u004C\u006F\u00A7\u0094")) {

            APPFcharset = "CP037"; // EBCDIC

}

        else {

            APPFcharset = "iso-8859-1"; //to not loose any bytes

}

        // convert start of xml input according to APPFcharset

        String xmlstart = null;

        try {

//          System.err.println("---------------------------");

//          System.err.println("ignoreBytes="+ignoreBytes);

//          System.err.println("s="+s);

//          System.err.println("APPFcharset="+APPFcharset);

//          if (APPFcharset!=null){xmlstart = new String(s.substring(ignoreBytes).getBytes("iso-8859-1"), APPFcharset);}

//          else {xmlstart=new String(s.substring(ignoreBytes).getBytes("iso-8859-1"));APPFcharset = "UTF-8";}

            xmlstart = new String(s.substring(ignoreBytes).getBytes("iso-8859-1"), APPFcharset);

        } catch (UnsupportedEncodingException e) {

            throw new getRDFException("Unsupported encoding '"+APPFcharset+"'.");

}

        RE r;

        try {

            r = new RE("<\\?xml[ \\t\\n\\r]+version[ \\t\\n\\r]?=[ \\t\\n\\r]?(['\"])([a-zA-Z0-9_:]|\\.|-)+\\1[ \\t\\n\\r]+encoding[ \\t\\n\\r]?=[ \\t\\n\\r]?(['\"])([A-Za-z]([A-Za-z0-9._]|-)*)\\3");

        } catch (RESyntaxException res) {

            throw new getRDFException("Wrong regular expression syntax.");

}

        // r.setMatchFlags(MATCH_NORMAL | MATCH_SINGLELINE);

        String XMLcharset = null;

        if (r.match(xmlstart) && r.getParenStart(0)==0)

            XMLcharset = r.getParen(4);

        if (HTTPcharset != null)

            HTTPcharset = HTTPcharset.toUpperCase();

        if (XMLcharset != null)

            XMLcharset = XMLcharset.toUpperCase();

        String finalCharset = null;

        if (HTTPcharset != null) {

            if (XMLcharset != null && !HTTPcharset.equals(XMLcharset))

                throw new getRDFException("Charset conflict: Content-Type: "

                    + contentT+ ". XML encoding: " + XMLcharset + ".");

            finalCharset = HTTPcharset;

}

        else if (XMLcharset != null)

            finalCharset = XMLcharset;

        if ((finalCharset != null && finalCharset.equals("UTF-16")) ||

                (finalCharset == null && APPFcharset.startsWith("UTF-16")))

            if (ignoreBytes == 2)

                finalCharset = APPFcharset; // use correct endianness

            else

                throw new getRDFException("Illegal XML: UTF-16 without BOM.");

        if (finalCharset == null)

            finalCharset = "UTF-8";

        try {

            bis.reset();                 // move back to start of stream

            bis.skip(ignoreBytes);       // skip BOM

        } catch (IOException e) {

            throw new getRDFException("IOException while resetting stream.");

}

        InputStreamReader isr = null;

        try {

            isr = new InputStreamReader(bis, finalCharset);

        } catch (UnsupportedEncodingException e) {

            throw new getRDFException("Unsupported encoding '"+finalCharset+"'.");

}

        StringBuffer sb=new StringBuffer("");

        int bytenum=0;

        try {// read whole file as characters

            int c;

            while ((c = isr.read()) != -1) {

                sb.append((char)c);

                bytenum++;

}

}

        catch (IOException e){

            throw new getRDFException("Undecodable data when reading URI at byte "+bytenum+" using encoding '"+finalCharset+"'."+" Please check encoding and encoding declaration of your document.");

}

        // todo: fix encoding parameter in xml pseudo-PI

        return sb.toString();

}

阅读全文……

标签 : xml

发表评论

IT瘾于2013年9月12日上午10时31分00秒发布 #

发表评论发送引用通报

Re: 解析获取Xml Encoding字符集Charset Anonymous于2025年12月30日下午01时56分36秒评论 #
标题
正文	HTML : b, strong, i, em, blockquote, br, p, pre, a href="", ul, ol, li, sub, sup
OpenID Login	(Not me?)
姓名
电子邮件
网站
记住我	是否
电邮地址不会公开在网页上，您留下的电子邮件仅用于本文有新评论时通知您（以后可以随时拿掉）。

解析获取Xml Encoding字符集Charset

Re: 解析获取Xml Encoding字符集Charset