<< 同时入选IMDB和豆瓣电影100强的经典电影 | 首页 | Handle UTF8 file with BOM - Real's Java How-to >>

解析获取Xml Encoding字符集Charset

    /*
     * Given a URI string, open it, read its contents into a String
     * and return the String
     *
     *@param uri the URI to open
     *@return the content at the URI or null if any error occurs
     */
    private String getRDFfromURI (String uri) throws getRDFException
    {
        /* add something like this code here, to allow reading from a file:
           (if we really want to allow this!)
           File ff = new File(uri);
           in = new FileInputStream(ff);
        */
        URL url = null;
        try {
            url = new URL(uri);
        } catch (MalformedURLException e) {
            throw new getRDFException("Malformed URI.");
        }
 
        URLConnection con = null;
        try {
            con = url.openConnection();
            con.setRequestProperty("Accept", "application/rdf+xml");
            con.connect();
        } catch (Exception e) {
            throw new getRDFException("Unable to open connection.");
        }
        String contentT = con.getContentType();
        String HTTPcharset = null;
        if (contentT != null) {
            ContentType contentType = null;
            try {
                contentType = new ContentType(con.getContentType());
            } catch (javax.mail.internet.ParseException e) {
                throw new getRDFException("Unparsable content type.");
            }
            HTTPcharset = contentType.getParameter("charset");
        }
 
        // need buffer for lookahead for encoding detection
        BufferedInputStream bis = null;
        try {
            bis = new BufferedInputStream(con.getInputStream());
        } catch (IOException e) {
            throw new getRDFException("Cannot open stream.");
        }
        bis.mark(200); // mark start so that we can get back to it
        String s = "";
 
        try { // read start of file as bytes
            int c;
            int numRead = 0;
            while ((c = bis.read()) != -1) {
                s += (char)c;
                if (numRead++ >= 195) break;
            }
        } catch (IOException e) {
            throw new getRDFException("IOException while starting reading.");
        }
 
        if (s.equals(""))
            // Nothing was returned
            throw new getRDFException("Empty document, ignored.");
 
        // A server could return content but not the RDF/XML that
        // we need.  Check the beginning of s and if it looks like
        // a generic HTML message, return an error.
        if (s.startsWith("<!DOCTYPE"))
            throw new getRDFException("Document looks like HTML, ignored.");
 
        String APPFcharset = null; // 'charset' according to XML APP. F
        int ignoreBytes = 0;
        if (s.startsWith("\u00FE\u00FF")) {
            APPFcharset = "UTF-16BE";
            ignoreBytes = 2;
        }
        else if (s.startsWith("\u00FF\u00FE")) {
            APPFcharset = "UTF-16LE";
            ignoreBytes = 2;
        }
        else if (s.startsWith("\u00EF\u00BB\u00BF")) {
            APPFcharset = "UTF-8";
            ignoreBytes = 3;
        }
        else if (s.startsWith("\u0000<\u0000?")) {
            APPFcharset = "UTF-16BE";
        }
        else if (s.startsWith("<\u0000?\u0000")) {
            APPFcharset = "UTF-16LE";
        }
        else if (s.startsWith("<?xml")) {
            APPFcharset = "iso-8859-1"; //to not loose any bytes
        }
        else if (s.startsWith("\u004C\u006F\u00A7\u0094")) {
            APPFcharset = "CP037"; // EBCDIC
        }
        else {
            APPFcharset = "iso-8859-1"; //to not loose any bytes
        }
 
        // convert start of xml input according to APPFcharset
        String xmlstart = null;
        try {
//          System.err.println("---------------------------");
//          System.err.println("ignoreBytes="+ignoreBytes);
//          System.err.println("s="+s);
//          System.err.println("APPFcharset="+APPFcharset);
//          if (APPFcharset!=null){xmlstart = new String(s.substring(ignoreBytes).getBytes("iso-8859-1"), APPFcharset);}
//          else {xmlstart=new String(s.substring(ignoreBytes).getBytes("iso-8859-1"));APPFcharset = "UTF-8";}
            xmlstart = new String(s.substring(ignoreBytes).getBytes("iso-8859-1"), APPFcharset);
        } catch (UnsupportedEncodingException e) {
            throw new getRDFException("Unsupported encoding '"+APPFcharset+"'.");
        }
        RE r;
        try {
            r = new RE("<\\?xml[ \\t\\n\\r]+version[ \\t\\n\\r]?=[ \\t\\n\\r]?(['\"])([a-zA-Z0-9_:]|\\.|-)+\\1[ \\t\\n\\r]+encoding[ \\t\\n\\r]?=[ \\t\\n\\r]?(['\"])([A-Za-z]([A-Za-z0-9._]|-)*)\\3");
        } catch (RESyntaxException res) {
            throw new getRDFException("Wrong regular expression syntax.");
        }
        // r.setMatchFlags(MATCH_NORMAL | MATCH_SINGLELINE);
        String XMLcharset = null;
        if (r.match(xmlstart) && r.getParenStart(0)==0)
            XMLcharset = r.getParen(4);
        if (HTTPcharset != null)
            HTTPcharset = HTTPcharset.toUpperCase();
        if (XMLcharset != null)
            XMLcharset = XMLcharset.toUpperCase();
 
        String finalCharset = null;
        if (HTTPcharset != null) {
            if (XMLcharset != null && !HTTPcharset.equals(XMLcharset))
                throw new getRDFException("Charset conflict: Content-Type: "
                    + contentT+ ". XML encoding: " + XMLcharset + ".");
            finalCharset = HTTPcharset;
        }
        else if (XMLcharset != null)
            finalCharset = XMLcharset;
        if ((finalCharset != null && finalCharset.equals("UTF-16")) ||
                (finalCharset == null && APPFcharset.startsWith("UTF-16")))
            if (ignoreBytes == 2)
                finalCharset = APPFcharset; // use correct endianness
            else
                throw new getRDFException("Illegal XML: UTF-16 without BOM.");
        if (finalCharset == null)
            finalCharset = "UTF-8";
 
        try {
            bis.reset();                 // move back to start of stream
            bis.skip(ignoreBytes);       // skip BOM
        } catch (IOException e) {
            throw new getRDFException("IOException while resetting stream.");
        }
 
        InputStreamReader isr = null;
        try {
            isr = new InputStreamReader(bis, finalCharset);
        } catch (UnsupportedEncodingException e) {
            throw new getRDFException("Unsupported encoding '"+finalCharset+"'.");
        }
        StringBuffer sb=new StringBuffer("");
        int bytenum=0;
        try {// read whole file as characters
            int c;
            while ((c = isr.read()) != -1) {
                sb.append((char)c);
                bytenum++;
            }
        }
        catch (IOException e){
            throw new getRDFException("Undecodable data when reading URI at byte "+bytenum+" using encoding '"+finalCharset+"'."+" Please check encoding and encoding declaration of your document.");
        }
        // todo: fix encoding parameter in xml pseudo-PI
        return sb.toString();
    }

阅读全文……

标签 :



发表评论 发送引用通报