设为首页 收藏本站
查看: 647|回复: 0

[经验分享] Apache Tika源码研究(五)

[复制链接]

尚未签到

发表于 2015-8-1 09:45:54 | 显示全部楼层 |阅读模式
  Apache Tika是如何识别待解析文档的MIME类型的?又是如何根据MIME类型找到对应的解析类Parser的?如果要添加自定义的MIME类型及其解析类,又该如何处理?前面的文章还没有具体回答这些关键问题。
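  在tika-core的jar包中,org.apache.tika.mime路径下有一个tika-mimetypes.xml文件,里面记录了tika支持的mime类型。文件格式如下(示例片段):

<mime-info>
  <mime-type type="application/pdf">
    <alias type="application/x-pdf"/>
    <_comment>Portable Document Format</_comment>
    <magic priority="50">
      <match value="%PDF-" type="string" offset="0"/>
    </magic>
    <glob pattern="*.pdf"/>
  </mime-type>
</mime-info>
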
  因为文件比较长,我这里就只贴出来了一部分
  先熟悉一下tika的mime类型加载及mime类型检测的相关UML模型图
DSC0000.png
  Apache Tika是通过SAX方式来解析该XML文件的,事件处理类MimeTypesReader的源码如下:



/**
 * SAX event handler that reads a MIME type definition document and registers
 * what it finds with a {@link MimeTypes} registry.
 *
 * <p>The handler is stateful: it tracks the type element that is currently
 * open, the priority of the magic element that is currently open, and a chain
 * of {@link ClauseRecord}s mirroring the nesting of match elements. Element
 * and attribute names come from the constants of {@link MimeTypesReaderMetKeys}.
 */
class MimeTypesReader extends DefaultHandler implements MimeTypesReaderMetKeys {

    /** Registry that receives every type, alias, pattern and magic that is read. */
    private final MimeTypes types;

    /** Type whose element is currently open, or null when outside any type element. */
    private MimeType type = null;

    /** Priority taken from the most recently opened magic element. */
    private int priority;

    /** Collects character data while a comment element is open; null otherwise. */
    private StringBuilder characters = null;

    /** Innermost magic/match record that is currently open. */
    private ClauseRecord current = new ClauseRecord(null);

    /**
     * Creates a reader that populates the given registry.
     *
     * @param types registry to fill while parsing
     */
    MimeTypesReader(MimeTypes types) {
        this.types = types;
    }

    /**
     * Parses a type definition document from a stream, using this object as
     * the SAX handler.
     *
     * @param stream XML input
     * @throws IOException if the parser fails to read the stream
     * @throws MimeTypeException if no parser can be created or the document
     *         is rejected (by the parser or by this handler)
     */
    void read(InputStream stream) throws IOException, MimeTypeException {
        try {
            SAXParserFactory factory = SAXParserFactory.newInstance();
            factory.setNamespaceAware(false);
            // Request JAXP secure processing so the implementation's safety
            // limits apply to this configuration input. The constant is fully
            // qualified so that no extra import is required.
            factory.setFeature(
                    javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING, true);
            SAXParser parser = factory.newSAXParser();
            parser.parse(stream, this);
        } catch (ParserConfigurationException e) {
            throw new MimeTypeException("Unable to create an XML parser", e);
        } catch (SAXException e) {
            throw new MimeTypeException("Invalid type configuration", e);
        }
    }

    /**
     * Replays an already parsed DOM document as SAX events into this handler.
     *
     * @param document DOM tree holding the type definitions
     * @throws MimeTypeException if the transformation fails
     */
    void read(Document document) throws MimeTypeException {
        try {
            TransformerFactory factory = TransformerFactory.newInstance();
            Transformer transformer = factory.newTransformer();
            transformer.transform(new DOMSource(document), new SAXResult(this));
        } catch (TransformerException e) {
            throw new MimeTypeException("Failed to parse type registry", e);
        }
    }

    /**
     * Answers every entity lookup with empty content, so nothing is loaded
     * for the given public/system identifiers.
     */
    @Override
    public InputSource resolveEntity(String publicId, String systemId) {
        return new InputSource(new ByteArrayInputStream(new byte[0]));
    }

    /**
     * Handles an opening tag. While no type is open only
     * {@code MIME_TYPE_TAG} is recognized; inside a type the alias,
     * super-type, comment, glob, root-XML, match and magic elements are
     * processed and everything else is ignored.
     *
     * @throws SAXException if the registry rejects a type name or glob
     *         pattern, or if a magic priority is not an integer
     */
    @Override
    public void startElement(
            String uri, String localName, String qName,
            Attributes attributes) throws SAXException {
        if (type == null) {
            if (MIME_TYPE_TAG.equals(qName)) {
                String name = attributes.getValue(MIME_TYPE_TYPE_ATTR);
                try {
                    type = types.forName(name);
                } catch (MimeTypeException e) {
                    throw new SAXException(e);
                }
            }
        } else if (ALIAS_TAG.equals(qName)) {
            String alias = attributes.getValue(ALIAS_TYPE_ATTR);
            types.addAlias(type, MediaType.parse(alias));
        } else if (SUB_CLASS_OF_TAG.equals(qName)) {
            String parent = attributes.getValue(SUB_CLASS_TYPE_ATTR);
            types.setSuperType(type, MediaType.parse(parent));
        } else if (COMMENT_TAG.equals(qName)) {
            // Start buffering; characters() appends until the element closes.
            characters = new StringBuilder();
        } else if (GLOB_TAG.equals(qName)) {
            String pattern = attributes.getValue(PATTERN_ATTR);
            String isRegex = attributes.getValue(ISREGEX_ATTR);
            if (pattern != null) {
                try {
                    // Boolean.valueOf(null) is false: a missing attribute
                    // means the pattern is not treated as a regex.
                    types.addPattern(type, pattern, Boolean.valueOf(isRegex));
                } catch (MimeTypeException e) {
                    throw new SAXException(e);
                }
            }
        } else if (ROOT_XML_TAG.equals(qName)) {
            String namespace = attributes.getValue(NS_URI_ATTR);
            String name = attributes.getValue(LOCAL_NAME_ATTR);
            type.addRootXML(namespace, name);
        } else if (MATCH_TAG.equals(qName)) {
            String kind = attributes.getValue(MATCH_TYPE_ATTR);
            String offset = attributes.getValue(MATCH_OFFSET_ATTR);
            String value = attributes.getValue(MATCH_VALUE_ATTR);
            String mask = attributes.getValue(MATCH_MASK_ATTR);
            if (kind == null) {
                kind = "string";
            }
            // The new record remembers the previous one as its parent, which
            // is how nested match elements are tracked.
            current = new ClauseRecord(
                    new MagicMatch(type.getType(), kind, offset, value, mask));
        } else if (MAGIC_TAG.equals(qName)) {
            String value = attributes.getValue(MAGIC_PRIORITY_ATTR);
            if (value != null && value.length() > 0) {
                try {
                    priority = Integer.parseInt(value);
                } catch (NumberFormatException e) {
                    // Report a malformed priority like any other invalid
                    // configuration instead of leaking a runtime exception.
                    throw new SAXException(
                            "Invalid magic priority: " + value, e);
                }
            } else {
                // Priority used when the attribute is absent or empty.
                priority = 50;
            }
            current = new ClauseRecord(null);
        }
    }

    /**
     * Handles a closing tag; does nothing unless a type element is open.
     */
    @Override
    public void endElement(String uri, String localName, String qName) {
        if (type != null) {
            if (MIME_TYPE_TAG.equals(qName)) {
                type = null;
            } else if (COMMENT_TAG.equals(qName)) {
                type.setDescription(characters.toString().trim());
                characters = null;
            } else if (MATCH_TAG.equals(qName)) {
                current.stop();
            } else if (MAGIC_TAG.equals(qName)) {
                // One Magic per top-level match clause of this magic element.
                for (Clause clause : current.getClauses()) {
                    type.addMagic(new Magic(type, priority, clause));
                }
                current = null;
            }
        }
    }

    /**
     * Appends character data to the comment buffer while a comment element
     * is open; character data elsewhere is dropped.
     */
    @Override
    public void characters(char[] ch, int start, int length) {
        if (characters != null) {
            characters.append(ch, start, length);
        }
    }

    /**
     * One open magic or match element. Records are linked to the record that
     * was current when they were created, and each collects the clauses of
     * the match elements closed directly inside it.
     */
    private class ClauseRecord {

        /** Record that was current when this one was created. */
        private ClauseRecord parent;

        /** Clause of this element; null for the record of a magic element. */
        private Clause clause;

        /** Clauses of closed child match elements, or null if there are none. */
        private List<Clause> subclauses = null;

        public ClauseRecord(Clause clause) {
            this.parent = current;
            this.clause = clause;
        }

        /**
         * Closes this match element: combines its own clause with its child
         * clauses (if any), appends the result to the parent's clause list
         * and makes the parent current again.
         */
        public void stop() {
            if (subclauses != null) {
                Clause subclause;
                if (subclauses.size() == 1) {
                    subclause = subclauses.get(0);
                } else {
                    subclause = new OrClause(subclauses);
                }
                clause = new AndClause(clause, subclause);
            }
            if (parent.subclauses == null) {
                parent.subclauses = Collections.singletonList(clause);
            } else {
                if (parent.subclauses.size() == 1) {
                    // The first child was stored in an immutable singleton
                    // list; switch to a growable list before adding a second.
                    parent.subclauses = new ArrayList<Clause>(parent.subclauses);
                }
                parent.subclauses.add(clause);
            }
            current = current.parent;
        }

        /**
         * Returns the clauses collected from child match elements.
         *
         * @return the collected clauses; an empty list when no child match
         *         element was closed inside this record, never null
         */
        public List<Clause> getClauses() {
            if (subclauses == null) {
                return Collections.<Clause>emptyList();
            }
            return subclauses;
        }
    }
}
  
  这里的关键方法是void read(InputStream stream):它调用SAXParser的parse方法,以当前对象作为事件处理器解析tika-mimetypes.xml文件,并把解析出的类型信息填充到成员变量MimeTypes types中。
  接下来我们来看MimeTypesFactory类的源码:



/**
* Creates instances of MimeTypes.
*/
public class MimeTypesFactory {
/**
 * Builds a registry without loading any type definitions; equivalent to
 * calling {@code new MimeTypes()} directly.
 *
 * @return a newly constructed, empty registry
 */
public static MimeTypes create() {
    MimeTypes emptyRegistry = new MimeTypes();
    return emptyRegistry;
}
/**
 * Builds a registry from a DOM document: the document is read through a
 * {@code MimeTypesReader} and the registry is then initialized.
 *
 * @param document DOM tree holding the type definitions
 * @return the populated and initialized registry
 * @throws MimeTypeException if the type configuration is invalid
 */
public static MimeTypes create(Document document) throws MimeTypeException {
    final MimeTypes registry = new MimeTypes();
    final MimeTypesReader loader = new MimeTypesReader(registry);
    loader.read(document);
    registry.init();
    return registry;
}
/**
 * Builds a registry from one or more input streams. All streams are fed,
 * in argument order, to a single reader before the registry is
 * initialized. The streams are not closed by this method.
 *
 * @param inputStreams streams holding type definitions
 * @return the populated and initialized registry
 * @throws IOException if a stream can not be read
 * @throws MimeTypeException if the type configuration is invalid
 */
public static MimeTypes create(InputStream... inputStreams)
        throws IOException, MimeTypeException {
    final MimeTypes registry = new MimeTypes();
    final MimeTypesReader loader = new MimeTypesReader(registry);
    for (int index = 0; index < inputStreams.length; index++) {
        loader.read(inputStreams[index]);
    }
    registry.init();
    return registry;
}
/**
 * Single-stream convenience form: wraps the stream in a one-element array
 * and delegates to the array-based overload.
 *
 * @see #create(InputStream...)
 */
public static MimeTypes create(InputStream stream)
        throws IOException, MimeTypeException {
    final InputStream[] singleStream = { stream };
    return create(singleStream);
}
/**
* Creates and returns a MimeTypes instance from the resource
* at the location specified by the URL.  Opens and closes the
* InputStream from the URL.
* If multiple URLs are supplied, then they are loaded in turn.
*
* @throws IOException if the URL can not be accessed
* @throws MimeTypeException if the type configuration is invalid
*/
public static MimeTypes create(URL... urls)
throws IOException, MimeTypeException {
InputStream[] streams = new InputStream[urls.length];
for(int i=0; i

运维网声明 1、欢迎大家加入本站运维交流群:群②:261659950 群⑤:202807635 群⑦870801961 群⑧679858003
2、本站所有主题由该帖子作者发表,该帖子作者与运维网享有帖子相关版权
3、所有作品的著作权均归原作者享有,请您和我们一样尊重他人的著作权等合法权益。如果您对作品感到满意,请购买正版
4、禁止制作、复制、发布和传播具有反动、淫秽、色情、暴力、凶杀等内容的信息,一经发现立即删除。若您因此触犯法律,一切后果自负,我们对此不承担任何责任
5、所有资源均系网友上传或者通过网络收集,我们仅提供一个展示、介绍、观摩学习的平台,我们不对其内容的准确性、可靠性、正当性、安全性、合法性等负责,亦不承担任何法律责任
6、所有作品仅供您个人学习、研究或欣赏,不得用于商业或者其他用途,否则,一切后果均由您自己承担,我们对此不承担任何法律责任
7、如涉及侵犯版权等问题,请您及时通知我们,我们将立即采取措施予以解决
8、联系人Email:admin@iyunv.com 网址:www.yunweiku.com

所有资源均系网友上传或者通过网络收集,我们仅提供一个展示、介绍、观摩学习的平台,我们不对其承担任何法律责任,如涉及侵犯版权等问题,请您及时通知我们,我们将立即处理,联系人Email:kefu@iyunv.com,QQ:1061981298 本贴地址:https://www.yunweiku.com/thread-92908-1-1.html 上篇帖子: Apache Synapse介绍(译) 下篇帖子: apache+php+mysql在windows下的安装与配置
您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

扫码加入运维网微信交流群X

扫码加入运维网微信交流群

扫描二维码加入运维网微信交流群,最新一手资源尽在官方微信交流群!快快加入我们吧...

扫描微信二维码查看详情

客服E-mail:kefu@iyunv.com 客服QQ:1061981298


QQ群⑦:运维网交流群⑦ QQ群⑧:运维网交流群⑧ k8s群:运维网kubernetes交流群


提醒:禁止发布任何违反国家法律、法规的言论与图片等内容;本站内容均来自个人观点与网络等信息,非本站认同之观点.


本站大部分资源是网友从网上搜集分享而来,其版权均归原作者及其网站所有,我们尊重他人的合法权益,如有内容侵犯您的合法权益,请及时与我们联系进行核实删除!



合作伙伴: 青云cloud

快速回复 返回顶部 返回列表