Apache Tika源码研究(五)
Apache Tika是怎么识别待解析文档的mime类型的,是怎么根据mime类型得到相应的解析类Parser的,如果我们添加自定义mime类型以及相应的解析类,又该怎么处理呢?前面的文章还没有具体解决这些关键问题。在tika-core的jar路径org.apache.tika.mime下有一个tika-mimetypes.xml文件,里面记录了tika支持的mime类型,文件格式如下:
因为文件比较长,我这里就只贴出来了一部分
先熟悉一下tika的mime类型加载及mime类型检测的相关UML模型图
Apache Tika是通过SAX方式来解析该XML文件的,事件处理类MimeTypesReader的源码如下:
class MimeTypesReader extends DefaultHandler implements MimeTypesReaderMetKeys {
private final MimeTypes types;
/** Current type */
private MimeType type = null;
private int priority;
private StringBuilder characters = null;
MimeTypesReader(MimeTypes types) {
this.types = types;
}
void read(InputStream stream) throws IOException, MimeTypeException {
try {
SAXParserFactory factory = SAXParserFactory.newInstance();
factory.setNamespaceAware(false);
SAXParser parser = factory.newSAXParser();
parser.parse(stream, this);
} catch (ParserConfigurationException e) {
throw new MimeTypeException("Unable to create an XML parser", e);
} catch (SAXException e) {
throw new MimeTypeException("Invalid type configuration", e);
}
}
void read(Document document) throws MimeTypeException {
try {
TransformerFactory factory = TransformerFactory.newInstance();
Transformer transformer = factory.newTransformer();
transformer.transform(new DOMSource(document), new SAXResult(this));
} catch (TransformerException e) {
throw new MimeTypeException("Failed to parse type registry", e);
}
}
@Override
public InputSource resolveEntity(String publicId, String systemId) {
return new InputSource(new ByteArrayInputStream(new byte));
}
@Override
public void startElement(
String uri, String localName, String qName,
Attributes attributes) throws SAXException {
if (type == null) {
if (MIME_TYPE_TAG.equals(qName)) {
String name = attributes.getValue(MIME_TYPE_TYPE_ATTR);
try {
type = types.forName(name);
} catch (MimeTypeException e) {
throw new SAXException(e);
}
}
} else if (ALIAS_TAG.equals(qName)) {
String alias = attributes.getValue(ALIAS_TYPE_ATTR);
types.addAlias(type, MediaType.parse(alias));
} else if (SUB_CLASS_OF_TAG.equals(qName)) {
String parent = attributes.getValue(SUB_CLASS_TYPE_ATTR);
types.setSuperType(type, MediaType.parse(parent));
} else if (COMMENT_TAG.equals(qName)) {
characters = new StringBuilder();
} else if (GLOB_TAG.equals(qName)) {
String pattern = attributes.getValue(PATTERN_ATTR);
String isRegex = attributes.getValue(ISREGEX_ATTR);
if (pattern != null) {
try {
types.addPattern(type, pattern, Boolean.valueOf(isRegex));
} catch (MimeTypeException e) {
throw new SAXException(e);
}
}
} else if (ROOT_XML_TAG.equals(qName)) {
String namespace = attributes.getValue(NS_URI_ATTR);
String name = attributes.getValue(LOCAL_NAME_ATTR);
type.addRootXML(namespace, name);
} else if (MATCH_TAG.equals(qName)) {
String kind = attributes.getValue(MATCH_TYPE_ATTR);
String offset = attributes.getValue(MATCH_OFFSET_ATTR);
String value = attributes.getValue(MATCH_VALUE_ATTR);
String mask = attributes.getValue(MATCH_MASK_ATTR);
if (kind == null) {
kind = "string";
}
current = new ClauseRecord(
new MagicMatch(type.getType(), kind, offset, value, mask));
} else if (MAGIC_TAG.equals(qName)) {
String value = attributes.getValue(MAGIC_PRIORITY_ATTR);
if (value != null && value.length() > 0) {
priority = Integer.parseInt(value);
} else {
priority = 50;
}
current = new ClauseRecord(null);
}
}
@Override
public void endElement(String uri, String localName, String qName) {
if (type != null) {
if (MIME_TYPE_TAG.equals(qName)) {
type = null;
} else if (COMMENT_TAG.equals(qName)) {
type.setDescription(characters.toString().trim());
characters = null;
} else if (MATCH_TAG.equals(qName)) {
current.stop();
} else if (MAGIC_TAG.equals(qName)) {
for (Clause clause : current.getClauses()) {
type.addMagic(new Magic(type, priority, clause));
}
current = null;
}
}
}
@Override
public void characters(char[] ch, int start, int length) {
if (characters != null) {
characters.append(ch, start, length);
}
}
private ClauseRecord current = new ClauseRecord(null);
private class ClauseRecord {
private ClauseRecord parent;
private Clause clause;
private List subclauses = null;
public ClauseRecord(Clause clause) {
this.parent = current;
this.clause = clause;
}
public void stop() {
if (subclauses != null) {
Clause subclause;
if (subclauses.size() == 1) {
subclause = subclauses.get(0);
} else {
subclause = new OrClause(subclauses);
}
clause = new AndClause(clause, subclause);
}
if (parent.subclauses == null) {
parent.subclauses = Collections.singletonList(clause);
} else {
if (parent.subclauses.size() == 1) {
parent.subclauses = new ArrayList(parent.subclauses);
}
parent.subclauses.add(clause);
}
current = current.parent;
}
public List getClauses() {
return subclauses;
}
}
}
这里的关键方法是void read(InputStream stream):它调用SAXParser的parse方法执行事件处理,解析tika-mimetypes.xml文件,并初始化MimeTypes types成员变量。
接下来我们来看MimeTypesFactory类的源码:
/**
* Creates instances of MimeTypes.
*/
public class MimeTypesFactory {
/**
 * Builds a registry without reading any type definitions into it;
 * equivalent to {@code new MimeTypes()}.
 *
 * @return a newly constructed {@code MimeTypes}
 */
public static MimeTypes create() {
    MimeTypes registry = new MimeTypes();
    return registry;
}
/**
 * Builds a registry from a DOM document: a {@code MimeTypesReader} reads the
 * document into a new {@code MimeTypes}, then {@code init()} is called on
 * the registry before it is returned.
 *
 * @param document DOM document holding the type definitions
 * @return the populated registry
 * @throws MimeTypeException if the type configuration is invalid
 */
public static MimeTypes create(Document document) throws MimeTypeException {
    MimeTypes registry = new MimeTypes();
    MimeTypesReader reader = new MimeTypesReader(registry);
    reader.read(document);
    registry.init();
    return registry;
}
/**
 * Builds a registry from one or more XML streams. Each stream is read, in
 * argument order, by the same {@code MimeTypesReader} into a single
 * {@code MimeTypes}; {@code init()} is called once after the last stream.
 * This method does not close the stream(s) itself.
 *
 * @param inputStreams streams holding type definitions
 * @return the populated registry
 * @throws IOException if a stream can not be read
 * @throws MimeTypeException if the type configuration is invalid
 */
public static MimeTypes create(InputStream... inputStreams)
        throws IOException, MimeTypeException {
    MimeTypes registry = new MimeTypes();
    MimeTypesReader reader = new MimeTypesReader(registry);
    for (int i = 0; i < inputStreams.length; i++) {
        reader.read(inputStreams[i]);
    }
    registry.init();
    return registry;
}
/**
 * Single-stream convenience overload.
 *
 * @param stream stream holding type definitions
 * @return the populated registry
 * @throws IOException if the stream can not be read
 * @throws MimeTypeException if the type configuration is invalid
 * @see #create(InputStream...)
 */
public static MimeTypes create(InputStream stream)
        throws IOException, MimeTypeException {
    // Pass an explicit array: a bare create(stream) would resolve back to
    // this very overload and recurse forever.
    InputStream[] single = { stream };
    return create(single);
}
/**
* Creates and returns a MimeTypes instance from the resource
* at the location specified by the URL.Opens and closes the
* InputStream from the URL.
* If multiple URLs are supplied, then they are loaded in turn.
*
* @throws IOException if the URL can not be accessed
* @throws MimeTypeException if the type configuration is invalid
*/
public static MimeTypes create(URL... urls)
throws IOException, MimeTypeException {
InputStream[] streams = new InputStream;
for(int i=0; i
页:
[1]