使用Java基于数据流直接抽取excel文本
- - 脚本爱好者如下代码是直接基于数据流进行文本抽取,支持excel97-excel2003版本,之后的版本实际都是xml,抽取文本非常简单,因此在此处不再说明,代码仅供研究学习使用,禁止用于商业用途.
public class ExcelExtractor {
public static StringBuilder logBytes = new StringBuilder();
public static String bytesToString(byte[] ogiBytes, int start, int length, int fc, int[] offset, int fExtSt)
{
StringBuilder content = new StringBuilder();
int streamLength = start+length;
if(fc == 0)
{
for(int i=start;i<streamLength;i++)
{
if(i == streamLength - 1)
{
return content.toString();
}
String a = Integer.toHexString(ogiBytes[i+1] & 0xFF);
String b = Integer.toHexString(ogiBytes[i] & 0xFF);
if(a.length() == 1)
{
a = "0"+ a;
}
if(b.length() == 1)
{
b = "0"+ b;
}
String hexStr = a + b;
int ch = Integer.valueOf(hexStr, 16);
content.append( (char)ch );
i++;
}
}
else
{
for(int i=start;i<streamLength;i++)
{
int ch = ogiBytes[i] & 0xFF;
if(fExtSt == 0)
{
if(ch < 32 || ch > 126 || (ch > 57 && ch < 64))
{
streamLength++;
if(offset != null && offset.length > 0)
{
offset[0]++;
}
continue;
}
}
content.append( (char)ch );
}
}
return content.toString();
}
public static void bytesToString(byte[] ogiBytes, StringBuilder content, int start, int length, int fc, int[] offset)
{
content.append( bytesToString(ogiBytes, start, length, fc, offset, 1) );
}
public static void printLogBytes(List<Byte> legaled) throws Exception
{
logBytes = new StringBuilder();
logBytes.append("\n========================================================");
for(int a=0;a<legaled.size();a++)
{
if(a % 16 == 0)
{
logBytes.append("\n");
}
logBytes.append(Integer.toHexString(legaled.get(a) & 0xFF) +" ");
}
logBytes.append("\n========================================================");
FileUtil.writeAscFile("E:\\bytes.txt", logBytes.toString());
}
public static int getWorkBook(byte[] ogiBytes, Stream stream, int dirSect1)
{
for(int i=0;i<8;i++)
{
int offsetEntry = (dirSect1 + 1)*512 + i*128;
StringBuilder content = new StringBuilder();
bytesToString(ogiBytes, content, offsetEntry, 64, 0, null);
if(content.toString().indexOf("Workbook") > -1)
{
return offsetEntry;
}
}
return 0;
}
public static void processBoundSheet(Stream stream, int lbPlyPos, int docStart, int docLength, StringBuilder content, String[] sst)
{
if(sst == null || sst.length == 0)
{
return;
}
int[] offset = new int[1];
/**
* skip the BOF record
* */
offset[0] = lbPlyPos + 20;
short type = stream.getShort(offset);
if(type != 523)
{
System.out.println("Index record not found!");
return ;
}
int length = stream.getShort(offset);
offset[0] += length;
int streamLength = docStart+docLength;
while(offset[0] < streamLength)
{
type = stream.getShort(offset);
length = stream.getShort(offset);
if(type == 253)
{
offset[0] += 6;
int sstIndex = stream.getInteger(offset);
if(sstIndex < sst.length)
{
content.append( sst[sstIndex] );
}
}
else
{
offset[0] += length;
}
}
}
public static String[] getSST(Stream stream, int docStart, int docLength)
{
int[] offset = new int[1];
offset[0] = docStart;
int streamLength = docStart+docLength;
String[] sst = null;
while(offset[0] < streamLength)
{
short type = stream.getShort(offset);
short length = stream.getShort(offset);
switch(type)
{
case 252:
offset[0] += 4;
int sstLength = stream.getInteger(offset);
sst = new String[sstLength];
for(int i=0;i<sstLength;i++)
{
int cch = stream.getShort(offset);
byte flags = stream.getByte(offset);
int fHightByte = flags & 1;
int fExtSt = flags >> 2;
int fRichSt = flags >> 3;
int rgRun = 0;
int fc = fHightByte != 0 ? 0 : 1;
cch = fc == 0 ? cch*2 : cch;
if(fRichSt != 0)
{
short cRun = stream.getShort(offset);
rgRun = cRun*4;
}
if(fExtSt != 0)
{
offset[0] += 4;
}
sst[i] = bytesToString(stream.getBytes(), offset[0], cch, fc, offset, fExtSt);
if(fExtSt != 0)
{
offset[0] += 14 + 2;
}
offset[0] += (cch + rgRun);
}
break;
default:
offset[0] += length;
break;
}
}
return sst;
}
public static void main(String[] args) throws Exception
{
byte[] ogiBytes = FileUtil.readBinFile("D:\\tools\\oletest\\2.xls");
System.out.println("Total bytes: "+ ogiBytes.length);
if(
ogiBytes.length < 8 ||
(ogiBytes[0] & 0xFF) != 208 ||
(ogiBytes[1] & 0xFF) != 207 ||
(ogiBytes[2] & 0xFF) != 17 ||
(ogiBytes[3] & 0xFF) != 224 ||
(ogiBytes[4] & 0xFF) != 161 ||
(ogiBytes[5] & 0xFF) != 177 ||
(ogiBytes[6] & 0xFF) != 26 ||
(ogiBytes[7] & 0xFF) != 225
){
System.out.println("Not the doc file!");
return;
}
Stream stream = new Stream(ogiBytes);
int[] offset = new int[1];
offset[0] = 48;
int dirSect1 = stream.getInteger(offset);
int workbook = getWorkBook(ogiBytes, stream, dirSect1);
if(workbook <= 0)
{
System.out.println("This version of xls can not be parsed!");
return;
}
offset[0] = workbook + 116;
int startSect = stream.getInteger(offset);
int docStart = (startSect + 1)*512;
int docLength = stream.getInteger(offset);
StringBuilder content = new StringBuilder();
offset[0] = docStart;
int streamLength = docStart+docLength;
String[] sst = getSST(stream, docStart, docLength);
while(offset[0] < streamLength)
{
short type = stream.getShort(offset);
short length = stream.getShort(offset);
switch(type)
{
case 133:
int lbPlyPos = stream.getInteger(offset) + docStart;
System.out.println("BoundSheet found! "+ lbPlyPos);
processBoundSheet(stream, lbPlyPos, docStart, docLength, content, sst);
offset[0] += (length - 4);
break;
default:
offset[0] += length;
break;
}
}
FileUtil.writeAscFile("E:\\output.txt", content.toString(), false);
System.out.println("Done!");
}
}