/*
|
* Given a URI string, open it, read its contents into a String
|
* and return the String
|
*
|
*@param uri the URI to open
|
*@return the content at the URI or null if any error occurs
|
| */ |
| private String getRDFfromURI (String uri) throws getRDFException |
| { |
/* add something like this code here, to allow reading from a file:
|
(if we really want to allow this!)
|
File ff = new File(uri);
|
in = new FileInputStream(ff);
|
| */ |
| URL url = null; |
| try { |
| url = new URL(uri); |
| } catch (MalformedURLException e) { |
| throw new getRDFException("Malformed URI."); |
| } |
| |
| URLConnection con = null; |
| try { |
| con = url.openConnection(); |
| con.setRequestProperty("Accept", "application/rdf+xml"); |
| con.connect(); |
| } catch (Exception e) { |
| throw new getRDFException("Unable to open connection."); |
| } |
| String contentT = con.getContentType(); |
| String HTTPcharset = null; |
| if (contentT != null) { |
| ContentType contentType = null; |
| try { |
| contentType = new ContentType(con.getContentType()); |
| } catch (javax.mail.internet.ParseException e) { |
| throw new getRDFException("Unparsable content type."); |
| } |
| HTTPcharset = contentType.getParameter("charset"); |
| } |
| |
| // need buffer for lookahead for encoding detection |
| BufferedInputStream bis = null; |
| try { |
| bis = new BufferedInputStream(con.getInputStream()); |
| } catch (IOException e) { |
| throw new getRDFException("Cannot open stream."); |
| } |
| bis.mark(200); // mark start so that we can get back to it |
| String s = ""; |
| |
| try { // read start of file as bytes |
| int c; |
| int numRead = 0; |
| while ((c = bis.read()) != -1) { |
| s += (char)c; |
| if (numRead++ >= 195) break; |
| } |
| } catch (IOException e) { |
| throw new getRDFException("IOException while starting reading."); |
| } |
| |
| if (s.equals("")) |
| // Nothing was returned |
| throw new getRDFException("Empty document, ignored."); |
| |
| // A server could return content but not the RDF/XML that |
| // we need. Check the beginning of s and if it looks like |
| // a generic HTML message, return an error. |
| if (s.startsWith("<!DOCTYPE")) |
| throw new getRDFException("Document looks like HTML, ignored."); |
| |
| String APPFcharset = null; // 'charset' according to XML APP. F |
| int ignoreBytes = 0; |
| if (s.startsWith("\u00FE\u00FF")) { |
| APPFcharset = "UTF-16BE"; |
| ignoreBytes = 2; |
| } |
| else if (s.startsWith("\u00FF\u00FE")) { |
| APPFcharset = "UTF-16LE"; |
| ignoreBytes = 2; |
| } |
| else if (s.startsWith("\u00EF\u00BB\u00BF")) { |
| APPFcharset = "UTF-8"; |
| ignoreBytes = 3; |
| } |
| else if (s.startsWith("\u0000<\u0000?")) { |
| APPFcharset = "UTF-16BE"; |
| } |
| else if (s.startsWith("<\u0000?\u0000")) { |
| APPFcharset = "UTF-16LE"; |
| } |
| else if (s.startsWith("<?xml")) { |
| APPFcharset = "iso-8859-1"; //to not loose any bytes |
| } |
| else if (s.startsWith("\u004C\u006F\u00A7\u0094")) { |
| APPFcharset = "CP037"; // EBCDIC |
| } |
| else { |
| APPFcharset = "iso-8859-1"; //to not loose any bytes |
| } |
| |
| // convert start of xml input according to APPFcharset |
| String xmlstart = null; |
| try { |
| // System.err.println("---------------------------"); |
| // System.err.println("ignoreBytes="+ignoreBytes); |
| // System.err.println("s="+s); |
| // System.err.println("APPFcharset="+APPFcharset); |
| // if (APPFcharset!=null){xmlstart = new String(s.substring(ignoreBytes).getBytes("iso-8859-1"), APPFcharset);} |
| // else {xmlstart=new String(s.substring(ignoreBytes).getBytes("iso-8859-1"));APPFcharset = "UTF-8";} |
| xmlstart = new String(s.substring(ignoreBytes).getBytes("iso-8859-1"), APPFcharset); |
| } catch (UnsupportedEncodingException e) { |
| throw new getRDFException("Unsupported encoding '"+APPFcharset+"'."); |
| } |
| RE r; |
| try { |
| r = new RE("<\\?xml[ \\t\\n\\r]+version[ \\t\\n\\r]?=[ \\t\\n\\r]?(['\"])([a-zA-Z0-9_:]|\\.|-)+\\1[ \\t\\n\\r]+encoding[ \\t\\n\\r]?=[ \\t\\n\\r]?(['\"])([A-Za-z]([A-Za-z0-9._]|-)*)\\3"); |
| } catch (RESyntaxException res) { |
| throw new getRDFException("Wrong regular expression syntax."); |
| } |
| // r.setMatchFlags(MATCH_NORMAL | MATCH_SINGLELINE); |
| String XMLcharset = null; |
| if (r.match(xmlstart) && r.getParenStart(0)==0) |
| XMLcharset = r.getParen(4); |
| if (HTTPcharset != null) |
| HTTPcharset = HTTPcharset.toUpperCase(); |
| if (XMLcharset != null) |
| XMLcharset = XMLcharset.toUpperCase(); |
| |
| String finalCharset = null; |
| if (HTTPcharset != null) { |
| if (XMLcharset != null && !HTTPcharset.equals(XMLcharset)) |
| throw new getRDFException("Charset conflict: Content-Type: " |
| + contentT+ ". XML encoding: " + XMLcharset + "."); |
| finalCharset = HTTPcharset; |
| } |
| else if (XMLcharset != null) |
| finalCharset = XMLcharset; |
| if ((finalCharset != null && finalCharset.equals("UTF-16")) || |
| (finalCharset == null && APPFcharset.startsWith("UTF-16"))) |
| if (ignoreBytes == 2) |
| finalCharset = APPFcharset; // use correct endianness |
| else |
| throw new getRDFException("Illegal XML: UTF-16 without BOM."); |
| if (finalCharset == null) |
| finalCharset = "UTF-8"; |
| |
| try { |
| bis.reset(); // move back to start of stream |
| bis.skip(ignoreBytes); // skip BOM |
| } catch (IOException e) { |
| throw new getRDFException("IOException while resetting stream."); |
| } |
| |
| InputStreamReader isr = null; |
| try { |
| isr = new InputStreamReader(bis, finalCharset); |
| } catch (UnsupportedEncodingException e) { |
| throw new getRDFException("Unsupported encoding '"+finalCharset+"'."); |
| } |
| StringBuffer sb=new StringBuffer(""); |
| int bytenum=0; |
| try {// read whole file as characters |
| int c; |
| while ((c = isr.read()) != -1) { |
| sb.append((char)c); |
| bytenum++; |
| } |
| } |
| catch (IOException e){ |
| throw new getRDFException("Undecodable data when reading URI at byte "+bytenum+" using encoding '"+finalCharset+"'."+" Please check encoding and encoding declaration of your document."); |
| } |
| // todo: fix encoding parameter in xml pseudo-PI |
| return sb.toString(); |
| } |