/*
|
* Given a URI string, open it, read its contents into a String
|
* and return the String
|
*
|
*@param uri the URI to open
|
*@return the content at the URI or null if any error occurs
|
*/ |
private String getRDFfromURI (String uri) throws getRDFException |
{ |
/* add something like this code here, to allow reading from a file:
|
(if we really want to allow this!)
|
File ff = new File(uri);
|
in = new FileInputStream(ff);
|
*/ |
URL url = null; |
try { |
url = new URL(uri); |
} catch (MalformedURLException e) { |
throw new getRDFException("Malformed URI."); |
} |
|
URLConnection con = null; |
try { |
con = url.openConnection(); |
con.setRequestProperty("Accept", "application/rdf+xml"); |
con.connect(); |
} catch (Exception e) { |
throw new getRDFException("Unable to open connection."); |
} |
String contentT = con.getContentType(); |
String HTTPcharset = null; |
if (contentT != null) { |
ContentType contentType = null; |
try { |
contentType = new ContentType(con.getContentType()); |
} catch (javax.mail.internet.ParseException e) { |
throw new getRDFException("Unparsable content type."); |
} |
HTTPcharset = contentType.getParameter("charset"); |
} |
|
// need buffer for lookahead for encoding detection |
BufferedInputStream bis = null; |
try { |
bis = new BufferedInputStream(con.getInputStream()); |
} catch (IOException e) { |
throw new getRDFException("Cannot open stream."); |
} |
bis.mark(200); // mark start so that we can get back to it |
String s = ""; |
|
try { // read start of file as bytes |
int c; |
int numRead = 0; |
while ((c = bis.read()) != -1) { |
s += (char)c; |
if (numRead++ >= 195) break; |
} |
} catch (IOException e) { |
throw new getRDFException("IOException while starting reading."); |
} |
|
if (s.equals("")) |
// Nothing was returned |
throw new getRDFException("Empty document, ignored."); |
|
// A server could return content but not the RDF/XML that |
// we need. Check the beginning of s and if it looks like |
// a generic HTML message, return an error. |
if (s.startsWith("<!DOCTYPE")) |
throw new getRDFException("Document looks like HTML, ignored."); |
|
String APPFcharset = null; // 'charset' according to XML APP. F |
int ignoreBytes = 0; |
if (s.startsWith("\u00FE\u00FF")) { |
APPFcharset = "UTF-16BE"; |
ignoreBytes = 2; |
} |
else if (s.startsWith("\u00FF\u00FE")) { |
APPFcharset = "UTF-16LE"; |
ignoreBytes = 2; |
} |
else if (s.startsWith("\u00EF\u00BB\u00BF")) { |
APPFcharset = "UTF-8"; |
ignoreBytes = 3; |
} |
else if (s.startsWith("\u0000<\u0000?")) { |
APPFcharset = "UTF-16BE"; |
} |
else if (s.startsWith("<\u0000?\u0000")) { |
APPFcharset = "UTF-16LE"; |
} |
else if (s.startsWith("<?xml")) { |
APPFcharset = "iso-8859-1"; //to not loose any bytes |
} |
else if (s.startsWith("\u004C\u006F\u00A7\u0094")) { |
APPFcharset = "CP037"; // EBCDIC |
} |
else { |
APPFcharset = "iso-8859-1"; //to not loose any bytes |
} |
|
// convert start of xml input according to APPFcharset |
String xmlstart = null; |
try { |
// System.err.println("---------------------------"); |
// System.err.println("ignoreBytes="+ignoreBytes); |
// System.err.println("s="+s); |
// System.err.println("APPFcharset="+APPFcharset); |
// if (APPFcharset!=null){xmlstart = new String(s.substring(ignoreBytes).getBytes("iso-8859-1"), APPFcharset);} |
// else {xmlstart=new String(s.substring(ignoreBytes).getBytes("iso-8859-1"));APPFcharset = "UTF-8";} |
xmlstart = new String(s.substring(ignoreBytes).getBytes("iso-8859-1"), APPFcharset); |
} catch (UnsupportedEncodingException e) { |
throw new getRDFException("Unsupported encoding '"+APPFcharset+"'."); |
} |
RE r; |
try { |
r = new RE("<\\?xml[ \\t\\n\\r]+version[ \\t\\n\\r]?=[ \\t\\n\\r]?(['\"])([a-zA-Z0-9_:]|\\.|-)+\\1[ \\t\\n\\r]+encoding[ \\t\\n\\r]?=[ \\t\\n\\r]?(['\"])([A-Za-z]([A-Za-z0-9._]|-)*)\\3"); |
} catch (RESyntaxException res) { |
throw new getRDFException("Wrong regular expression syntax."); |
} |
// r.setMatchFlags(MATCH_NORMAL | MATCH_SINGLELINE); |
String XMLcharset = null; |
if (r.match(xmlstart) && r.getParenStart(0)==0) |
XMLcharset = r.getParen(4); |
if (HTTPcharset != null) |
HTTPcharset = HTTPcharset.toUpperCase(); |
if (XMLcharset != null) |
XMLcharset = XMLcharset.toUpperCase(); |
|
String finalCharset = null; |
if (HTTPcharset != null) { |
if (XMLcharset != null && !HTTPcharset.equals(XMLcharset)) |
throw new getRDFException("Charset conflict: Content-Type: " |
+ contentT+ ". XML encoding: " + XMLcharset + "."); |
finalCharset = HTTPcharset; |
} |
else if (XMLcharset != null) |
finalCharset = XMLcharset; |
if ((finalCharset != null && finalCharset.equals("UTF-16")) || |
(finalCharset == null && APPFcharset.startsWith("UTF-16"))) |
if (ignoreBytes == 2) |
finalCharset = APPFcharset; // use correct endianness |
else |
throw new getRDFException("Illegal XML: UTF-16 without BOM."); |
if (finalCharset == null) |
finalCharset = "UTF-8"; |
|
try { |
bis.reset(); // move back to start of stream |
bis.skip(ignoreBytes); // skip BOM |
} catch (IOException e) { |
throw new getRDFException("IOException while resetting stream."); |
} |
|
InputStreamReader isr = null; |
try { |
isr = new InputStreamReader(bis, finalCharset); |
} catch (UnsupportedEncodingException e) { |
throw new getRDFException("Unsupported encoding '"+finalCharset+"'."); |
} |
StringBuffer sb=new StringBuffer(""); |
int bytenum=0; |
try {// read whole file as characters |
int c; |
while ((c = isr.read()) != -1) { |
sb.append((char)c); |
bytenum++; |
} |
} |
catch (IOException e){ |
throw new getRDFException("Undecodable data when reading URI at byte "+bytenum+" using encoding '"+finalCharset+"'."+" Please check encoding and encoding declaration of your document."); |
} |
// todo: fix encoding parameter in xml pseudo-PI |
return sb.toString(); |
} |