使用Java基于数据流直接抽取word文本
—— 脚本爱好者
以下代码直接基于数据流进行文本抽取,支持 Word 97 至 Word 2003 版本。之后的版本实际上都是 XML,抽取文本非常简单,因此此处不再说明。代码仅供研究学习使用,禁止用于商业用途。
public class WordExtractor {
public static StringBuilder logBytes = new StringBuilder();
public static void bytesToString(byte[] ogiBytes, StringBuilder content, int start, int length, int fc)
{
byte[] bytes = new byte[length];
System.arraycopy(ogiBytes, start, bytes, 0, length);
if(fc == 0)
{
for(int i=0;i<bytes.length;i++)
{
if(i == bytes.length - 1)
{
return;
}
String hexStr = Integer.toHexString(bytes[i+1] & 0xFF) + Integer.toHexString(bytes[i] & 0xFF);
int ch = Integer.valueOf(hexStr, 16);
content.append( (char)ch );
i++;
}
}
else
{
for(int i=0;i<bytes.length;i++)
{
int ch = bytes[i] & 0xFF;
content.append( (char)ch );
}
}
}
public static void printLogBytes(List<Byte> legaled) throws Exception
{
logBytes = new StringBuilder();
logBytes.append("
========================================================");
for(int a=0;a<legaled.size();a++)
{
if(a % 16 == 0)
{
logBytes.append("
");
}
logBytes.append(Integer.toHexString(legaled.get(a) & 0xFF) +" ");
}
logBytes.append("
========================================================");
FileUtil.writeAscFile("E:\bytes.txt", logBytes.toString());
}
public static int getOneTable(byte[] ogiBytes, Stream stream, int dirSect1)
{
for(int i=0;i<8;i++)
{
int offsetEntry = (dirSect1 + 1)*512 + i*128;
StringBuilder content = new StringBuilder();
bytesToString(ogiBytes, content, offsetEntry, 64, 0);
if(content.toString().indexOf("1Table") > -1)
{
return offsetEntry;
}
}
return 0;
}
public static void main(String[] args) throws Exception
{
byte[] ogiBytes = FileUtil.readBinFile("D:\tools\oletest\test-old.doc");
System.out.println("Total bytes: "+ ogiBytes.length);
if(
ogiBytes.length < 8 ||
(ogiBytes[0] & 0xFF) != 208 ||
(ogiBytes[1] & 0xFF) != 207 ||
(ogiBytes[2] & 0xFF) != 17 ||
(ogiBytes[3] & 0xFF) != 224 ||
(ogiBytes[4] & 0xFF) != 161 ||
(ogiBytes[5] & 0xFF) != 177 ||
(ogiBytes[6] & 0xFF) != 26 ||
(ogiBytes[7] & 0xFF) != 225
){
System.out.println("Not the doc file!");
return;
}
StringBuilder content = new StringBuilder();
Stream stream = new Stream(ogiBytes);
int[] offset = new int[1];
offset[0] = 48;
int dirSect1 = stream.getInteger(offset);
int oneTable = getOneTable(ogiBytes, stream, dirSect1);
offset[0] = oneTable + 116;
int startSect = stream.getInteger(offset);
int tableStream = (startSect + 1)*512;
offset[0] = 930;
int fcClx = stream.getInteger(offset);
if(fcClx == -1)
{
System.out.println("This version of doc can not be parsed!");
return;
}
int offsetClx = tableStream + fcClx;
offset[0] = offsetClx + 1;
int lcb = stream.getInteger(offset);
int countPcd = (lcb - 4)/12;
int countCp = (lcb - countPcd*8)/4;
int offsetPlcpcd = offsetClx + 5;
for(int i=0;i<countPcd;i++)
{
int offsetPcd = offsetPlcpcd + countCp*4 + i*8;
offset[0] = offsetPcd + 2;
int start = stream.getInteger(offset);
int fc = start >> 30;
start = (start << 2) >> 2;
offset[0] = offsetPlcpcd + i*4;
int cpPre = stream.getInteger(offset);
int cpNext = stream.getInteger(offset);
int length = cpNext - cpPre -1;
if(fc == 0)
{
length *= 2;
}
else
{
start = start/2;
}
start += 512;
bytesToString(ogiBytes, content, start, length, fc);
System.out.println(start +", "+ length);
}
FileUtil.writeAscFile("E:\output.txt", content.toString(), false);
System.out.println("Done!");
}
}