使用Java基于数据流直接抽取ppt文本
- - 脚本爱好者

如下代码直接基于数据流进行文本抽取，支持 PowerPoint 97 至 PowerPoint 2003 版本；之后的版本实际上都是 XML 格式，抽取文本非常简单，因此此处不再说明。代码仅供研究学习使用，禁止用于商业用途。
/**
 * Extracts text from a binary (PowerPoint 97-2003) .ppt file by walking the raw bytes of the
 * compound file instead of using a document library.
 *
 * <p>The approach is deliberately simplistic: sectors are treated as 512 bytes long, only the
 * first eight directory entries following the first directory sector are inspected, and the
 * "PowerPoint Document" stream is copied as one contiguous byte range starting at its first
 * sector (no sector chain is followed). Files that do not fit these assumptions are rejected or
 * yield incomplete text.
 *
 * <p>NOTE(review): {@code Stream} and {@code FileUtil} are project classes not visible here.
 * {@code Stream.getShort(int[])} / {@code Stream.getInteger(int[])} are presumed to read a
 * little-endian value at {@code offset[0]} of the wrapped array - confirm against their source.
 */
public class PPTExtractor {

    /** Input file used by {@link #main(String[])} when no path argument is supplied. */
    private static final String DEFAULT_INPUT_PATH = "D:\\tools\\oletest\\cn-t.ppt";

    /** Output file used by {@link #main(String[])} when no second path argument is supplied. */
    private static final String DEFAULT_OUTPUT_PATH = "E:\\output.txt";

    /** Sector size assumed for every offset computation in this class. */
    private static final int SECTOR_SIZE = 512;

    /** Size in bytes of one compound-file directory entry. */
    private static final int DIR_ENTRY_SIZE = 128;

    /** Number of leading bytes of a directory entry that hold its UTF-16LE name. */
    private static final int DIR_ENTRY_NAME_BYTES = 64;

    /** How many consecutive directory entries are scanned for the document stream. */
    private static final int DIR_ENTRIES_SCANNED = 8;

    /** Offset of the "starting sector" field inside a directory entry. */
    private static final int DIR_ENTRY_START_SECTOR_OFFSET = 116;

    /** Offset of the "stream size" field inside a directory entry. */
    private static final int DIR_ENTRY_STREAM_SIZE_OFFSET = 120;

    /** Offset of the "first directory sector" field inside the file header. */
    private static final int HEADER_FIRST_DIR_SECTOR_OFFSET = 48;

    /** Size in bytes of a PowerPoint record header (version/instance, type, length). */
    private static final int RECORD_HEADER_SIZE = 8;

    /** Record type whose payload is UTF-16LE text. */
    private static final int TEXT_CHARS_ATOM = 0x0FA0;

    /** Record type whose payload is one byte per character. */
    private static final int TEXT_BYTES_ATOM = 0x0FA8;

    /** The eight signature bytes every compound file starts with. */
    private static final int[] OLE_SIGNATURE = {0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1};

    /** Hex dump produced by the most recent {@link #printLogBytes(List)} call. */
    public static StringBuilder logBytes = new StringBuilder();

    /**
     * Locates the directory entry named "PowerPoint Document".
     *
     * @param ogiBytes the complete file content
     * @param stream   unused; kept so existing callers keep compiling
     * @param dirSect1 sector number of the first directory sector, as read from the header
     * @return the byte offset of the matching directory entry inside {@code ogiBytes}, or
     *         {@code 0} when none of the scanned entries matches
     */
    public static int getPPTDcoument(byte[] ogiBytes, Stream stream, int dirSect1)
    {
        for (int i = 0; i < DIR_ENTRIES_SCANNED; i++)
        {
            // long arithmetic so a corrupt sector number cannot wrap around into a valid offset
            long entryStart = ((long) dirSect1 + 1) * SECTOR_SIZE + (long) i * DIR_ENTRY_SIZE;
            if (entryStart < 0 || entryStart + DIR_ENTRY_SIZE > ogiBytes.length)
            {
                // entry would lie (partly) outside the file: skip it instead of failing
                continue;
            }
            int offsetEntry = (int) entryStart;
            StringBuilder content = new StringBuilder();
            bytesToString(ogiBytes, content, offsetEntry, DIR_ENTRY_NAME_BYTES, 0);
            if (content.toString().indexOf("PowerPoint Document") > -1)
            {
                return offsetEntry;
            }
        }
        return 0;
    }

    /**
     * Decodes a byte range and appends the resulting characters to {@code content}.
     *
     * @param ogiBytes source bytes
     * @param content  receives the decoded characters
     * @param start    index of the first byte to decode
     * @param length   number of bytes to decode
     * @param fc       {@code 0} to decode pairs of bytes as UTF-16LE code units (a trailing odd
     *                 byte is ignored); any other value to map each byte to the character with
     *                 the same unsigned value
     * @throws IndexOutOfBoundsException  if the range does not lie inside {@code ogiBytes}
     * @throws NegativeArraySizeException if {@code length} is negative
     */
    public static void bytesToString(byte[] ogiBytes, StringBuilder content, int start, int length, int fc)
    {
        // Copying first makes an invalid range fail before anything is appended.
        byte[] bytes = new byte[length];
        System.arraycopy(ogiBytes, start, bytes, 0, length);
        if (fc == 0)
        {
            for (int i = 0; i + 1 < bytes.length; i += 2)
            {
                // Combine arithmetically. Concatenating two Integer.toHexString results drops
                // the leading zero of a low byte below 0x10 and so yields the wrong code unit
                // (e.g. bytes 00 4E would decode to 0x4E0 instead of 0x4E00).
                int ch = ((bytes[i + 1] & 0xFF) << 8) | (bytes[i] & 0xFF);
                content.append((char) ch);
            }
        }
        else
        {
            for (int i = 0; i < bytes.length; i++)
            {
                content.append((char) (bytes[i] & 0xFF));
            }
        }
    }

    /**
     * Debug helper: renders the bytes as space-separated hex, 16 per line, stores the dump in
     * {@link #logBytes} (replacing the previous content) and writes it to {@code E:\bytes.txt}
     * through {@code FileUtil.writeAscFile}.
     *
     * @param legaled bytes to dump
     * @throws Exception whatever {@code FileUtil.writeAscFile} throws
     */
    public static void printLogBytes(List<Byte> legaled) throws Exception
    {
        logBytes = new StringBuilder();
        logBytes.append("\n========================================================");
        for (int a = 0; a < legaled.size(); a++)
        {
            if (a % 16 == 0)
            {
                logBytes.append("\n");
            }
            logBytes.append(Integer.toHexString(legaled.get(a) & 0xFF)).append(" ");
        }
        logBytes.append("\n========================================================");
        FileUtil.writeAscFile("E:\\bytes.txt", logBytes.toString());
    }

    /**
     * Examines the record whose header starts at {@code start}, appends its text to
     * {@code content} when it is a text atom, and reports where the next record header begins.
     *
     * <p>A record whose low four header bits are all set is treated as a container: it is not
     * skipped but entered, so the next header is the one directly after the container header.
     *
     * @param stream  reader used for the type and length fields; assumed to wrap the same data
     *                as {@code bytes} - TODO confirm
     * @param bytes   the document stream
     * @param start   index of the record header to examine
     * @param content receives the text of text atoms
     * @param offset  one-element scratch cursor handed to {@code stream}; overwritten
     * @return index of the next record header, or {@code -1} when no further complete header
     *         fits or the current record is truncated or has a negative length
     */
    public static int findTextRecords(Stream stream, byte[] bytes, int start, StringBuilder content, int[] offset)
    {
        if (start < 0 || start > bytes.length - RECORD_HEADER_SIZE)
        {
            // no complete header at this position (also covers an empty stream)
            return -1;
        }
        int container = bytes[start] & 0x0f;
        if (container == 0x0f)
        {
            return nextHeaderOrEnd(start + RECORD_HEADER_SIZE, bytes.length);
        }
        offset[0] = start + 2;
        int type = stream.getShort(offset);
        offset[0] = start + 4;
        int len = stream.getInteger(offset);
        if (len < 0 || len > bytes.length - start - RECORD_HEADER_SIZE)
        {
            // A negative length would move the cursor backwards (possibly forever); a length
            // reaching past the buffer means the record is truncated. Stop in both cases.
            return -1;
        }
        if (type == TEXT_BYTES_ATOM)
        {
            bytesToString(bytes, content, start + RECORD_HEADER_SIZE, len, 1);
            System.out.println("Text Bytes Atom found!");
        }
        if (type == TEXT_CHARS_ATOM)
        {
            bytesToString(bytes, content, start + RECORD_HEADER_SIZE, len, 0);
            System.out.println("Text Chars Atom found!");
        }
        return nextHeaderOrEnd(start + RECORD_HEADER_SIZE + len, bytes.length);
    }

    /** Returns {@code candidate} if a full record header fits there, otherwise {@code -1}. */
    private static int nextHeaderOrEnd(int candidate, int total)
    {
        return candidate > total - RECORD_HEADER_SIZE ? -1 : candidate;
    }

    /** Tells whether {@code data} starts with the eight compound-file signature bytes. */
    private static boolean hasOleSignature(byte[] data)
    {
        if (data.length < OLE_SIGNATURE.length)
        {
            return false;
        }
        for (int i = 0; i < OLE_SIGNATURE.length; i++)
        {
            if ((data[i] & 0xFF) != OLE_SIGNATURE[i])
            {
                return false;
            }
        }
        return true;
    }

    /**
     * Reads a .ppt file, extracts the text of all text atoms found in its "PowerPoint Document"
     * stream and writes it to a text file.
     *
     * @param args optional: {@code args[0]} is the input .ppt path and {@code args[1]} the
     *             output text path; each falls back to the built-in default when absent
     * @throws Exception whatever the {@code FileUtil} helpers throw
     */
    public static void main(String[] args) throws Exception
    {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;
        String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_PATH;

        byte[] ogiBytes = FileUtil.readBinFile(inputPath);
        System.out.println("Total bytes: "+ ogiBytes.length);
        // The header field read below sits at offset 48, so the signature alone is not enough:
        // require one full header sector.
        if (ogiBytes.length < SECTOR_SIZE || !hasOleSignature(ogiBytes))
        {
            System.out.println("Not the ppt file!");
            return;
        }

        Stream stream = new Stream(ogiBytes);
        int[] offset = new int[1];
        offset[0] = HEADER_FIRST_DIR_SECTOR_OFFSET;
        int dirSect1 = stream.getInteger(offset);
        int pptDocument = getPPTDcoument(ogiBytes, stream, dirSect1);
        if (pptDocument <= 0)
        {
            System.out.println("This version of ppt can not be parsed!");
            return;
        }

        offset[0] = pptDocument + DIR_ENTRY_START_SECTOR_OFFSET;
        int startSect = stream.getInteger(offset);
        // Position the cursor explicitly so the size read does not depend on whether
        // getInteger advances offset[0].
        offset[0] = pptDocument + DIR_ENTRY_STREAM_SIZE_OFFSET;
        int docLength = stream.getInteger(offset);

        long docStart = ((long) startSect + 1) * SECTOR_SIZE;
        if (startSect < 0 || docLength < 0 || docStart + docLength > ogiBytes.length)
        {
            // the stream cannot be one contiguous range inside this file
            System.out.println("This version of ppt can not be parsed!");
            return;
        }

        byte[] bytes = new byte[docLength];
        System.arraycopy(ogiBytes, (int) docStart, bytes, 0, docLength);
        stream = new Stream(bytes);
        StringBuilder content = new StringBuilder();
        int start = 0;
        while (start != -1)
        {
            start = findTextRecords(stream, bytes, start, content, offset);
        }
        FileUtil.writeAscFile(outputPath, content.toString(), false);
        System.out.println("Done!");
    }
}