package org.apache.nutch.parse.java;

import japa.parser.ast.Comment;
import japa.parser.ast.CompilationUnit;
import japa.parser.ast.ImportDeclaration;
import japa.parser.ast.PackageDeclaration;
import japa.parser.ast.body.TypeDeclaration;
import japa.parser.JavaParser;

import java.io.*;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.*;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.LogUtil;
import org.apache.nutch.util.NutchConfiguration;

/**
 * {@link Parser} implementation for Java source files.
 *
 * <p>The raw content is handed to {@link JavaParser}; the package name, import
 * names, type names and comment texts of the resulting compilation unit are
 * concatenated (space separated) into the indexable text. Import names, type
 * names and comment texts are additionally recorded in the content metadata
 * under the keys {@code "import"}, {@code "type"} and {@code "comment"}.
 */
public class JavaCodeParser implements Parser {

  public static final Log LOG = LogFactory.getLog(JavaCodeParser.class);

  /** Characters stripped from comment text before it is indexed. */
  private static final String COMMENT_NOISE_REGEX = "[/*\n]";

  private Configuration conf;

  public JavaCodeParser() {
  }

  /** Returns the configuration previously supplied via {@link #setConf}. */
  public Configuration getConf() {
    return conf;
  }

  /** Stores the configuration used when building parse results and outlinks. */
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  /**
   * Parses {@code content} as Java source code.
   *
   * @param content the fetched content; its bytes are fed to the Java parser
   *        and its metadata object receives the extracted entries
   * @return a successful parse result carrying the extracted text, or an empty
   *         parse result with status {@link ParseStatus#FAILED} when the bytes
   *         cannot be parsed as Java
   */
  public ParseResult getParse(Content content) {
    PackageDeclaration pakage;
    List<ImportDeclaration> imports;
    List<TypeDeclaration> types;
    List<Comment> comments;

    try {
      byte[] raw = content.getContent();
      // Parse the bytes and obtain a CompilationUnit instance.
      CompilationUnit cu = JavaParser.parse(new ByteArrayInputStream(raw));
      // Collect the pieces of the source that get indexed.
      pakage = cu.getPackage();
      imports = cu.getImports();
      types = cu.getTypes();
      comments = cu.getComments();
    } catch (Exception e) {
      // Any failure here (including runtime exceptions thrown by the parser)
      // is reported as a failed parse instead of being propagated.
      if (LOG.isWarnEnabled()) {
        e.printStackTrace(LogUtil.getWarnStream(LOG));
        LOG.warn("nutch:parse-java:JavaCodeParser Exception: " + e.getMessage());
      }
      return new ParseStatus(ParseStatus.FAILED,
          "Can't be handled as java document. \n" + e)
          .getEmptyParseResult(content.getUrl(), getConf());
    }

    StringBuilder indexText = new StringBuilder();
    Metadata metadata = content.getMetadata();

    // A source file need not declare a package, imports, types or comments.
    // NOTE(review): the parser appears to return null for absent sections --
    // confirm against the JavaParser version in use. A null is treated as
    // "nothing to index" rather than failing with a NullPointerException.
    if (pakage != null) {
      indexText.append(pakage.getName());
      // Separator so the package name does not fuse with the next token.
      indexText.append(" ");
    }

    if (imports != null) {
      for (ImportDeclaration id : imports) {
        String importName = id.getName().toString();
        indexText.append(importName);
        indexText.append(" ");
        metadata.add("import", importName);
      }
    }

    if (types != null) {
      for (TypeDeclaration td : types) {
        String typeName = td.getName().toString();
        indexText.append(typeName);
        indexText.append(" ");
        metadata.add("type", typeName);
      }
    }

    if (comments != null) {
      for (Comment c : comments) {
        // Strip comment delimiters and line breaks once, reuse for both sinks.
        String commentText = c.getContent().replaceAll(COMMENT_NOISE_REGEX, "");
        indexText.append(commentText);
        indexText.append(" ");
        metadata.add("comment", commentText);
      }
    }

    String text = indexText.toString();
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "",
        OutlinkExtractor.getOutlinks(text, getConf()), metadata);
    return ParseResult.createParseResult(content.getUrl(),
        new ParseImpl(text, parseData));
  }
}