amachangのXPath機能テストで、JavaのXPathAPIのテストをしてみた。

本当はHTMLを取得して解析しようと思ったが、XHTMLではないためDocumentBuilderでの解析に失敗する。対処するのが面倒なのと、HttpClientなどを使うと依存が増えるので、とりあえず該当箇所だけ引っぱり出してテストしてみた。

package xpath;

import com.sun.org.apache.xpath.internal.XPathAPI;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;
import junit.framework.TestCase;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.traversal.NodeIterator;
import org.xml.sax.SAXException;

/**
 * Evaluates every XPath expression listed in {@code testData.txt} against
 * {@code test.html} with {@code XPathAPI} and prints, for each expression,
 * whether the names of the selected nodes equal the expected names.
 *
 * <p>The test is report-only: a mismatch is printed as {@code [ng]} but does
 * not fail the test. Failures to load or parse the resources, or to evaluate
 * an expression, are propagated so that they are no longer silently ignored.
 */
public class XPathCoverageTest extends TestCase{
    
    /** Pattern that strips leading whitespace. */
    private static final Pattern LEFT_TRIM_PATTERN = Pattern.compile("^[\\s ]+");
    
    /** Pattern that strips trailing whitespace and commas. */
    private static final Pattern RIGHT_TRIM_PATTERN = Pattern.compile("[\\s ,]+$");
    
    /** Pattern that extracts the two quoted values of a {@code ['expr', 'names']} entry. */
    private static final Pattern PARAMETER_PATTERN = Pattern.compile("^\\['(.+)',\\s'(.+)'\\]");
    
    /**
     * Runs all expressions from the data file and prints one {@code [ok]} or
     * {@code [ng]} line per expression together with the time spent in
     * {@code XPathAPI.selectNodeIterator}.
     *
     * @throws SAXException if {@code test.html} cannot be parsed
     * @throws IOException if a resource is missing or cannot be read
     * @throws ParserConfigurationException if no document builder can be created
     * @throws TransformerException if an XPath expression cannot be evaluated
     */
    public void test_checkCoverage()
            throws SAXException, IOException, ParserConfigurationException, TransformerException {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setNamespaceAware(true);
        DocumentBuilder builder = factory.newDocumentBuilder();
        
        // Load the HTML document that the expressions are evaluated against.
        Document doc;
        InputStream html = openResource("test.html");
        try {
            doc = builder.parse(html);
        } finally {
            html.close();
        }
        
        // Load the test cases: XPath expression -> expected node names.
        Map<String, List<String>> testData = readData("testData.txt");
        for (Map.Entry<String, List<String>> testCase : testData.entrySet()) {
            String expression = testCase.getKey();
            List<String> expected = testCase.getValue();
            
            // Only the call that obtains the iterator is timed.
            long start = System.currentTimeMillis();
            NodeIterator nodes = XPathAPI.selectNodeIterator(doc, expression);
            long processTime = System.currentTimeMillis() - start;
            
            List<String> actual = new LinkedList<String>();
            Node node;
            while ((node = nodes.nextNode()) != null) {
                actual.add(node.getNodeName());
            }
            
            if (expected.equals(actual)) {
                System.out.println("[ok]" + expression + " " + processTime + "ms");
            } else {
                System.out.println("[ng]" + expression + " " + processTime + "ms");
                System.out.println("expected : " + expected
                        + ", value : " + actual.toString());
            }
        }
    }
    
    /**
     * Parses one line of the data file.
     *
     * <p>NOTE: the whole entry must be on the given line. An entry whose
     * expected names are on the following line does not match the pattern
     * and therefore yields an empty array.
     *
     * @param params a line such as {@code ['.//blockquote/p', 'p'],}
     * @return a two-element array (expression, space-separated expected
     *         names), or an empty array if the line is not a complete entry
     */
    private String[] parse(String params) {
        String withoutTail = RIGHT_TRIM_PATTERN.matcher(params).replaceAll("");
        String entry = LEFT_TRIM_PATTERN.matcher(withoutTail).replaceAll("");
        Matcher matcher = PARAMETER_PATTERN.matcher(entry);
        if (!matcher.matches()) {
            return new String[0];
        }
        return new String[] { matcher.group(1), matcher.group(2) };
    }
    
    /**
     * Reads the test data file from the class path.
     *
     * @param dataFileName resource name, resolved relative to this class
     * @return expressions mapped to their expected node names, in file order;
     *         lines that are not complete entries are skipped
     * @throws IOException if the resource is missing or cannot be read
     */
    private Map<String, List<String>> readData(String dataFileName) throws IOException {
        Map<String, List<String>> data = new LinkedHashMap<String, List<String>>();
        InputStream input = openResource(dataFileName);
        try {
            // Decode explicitly rather than with the platform default charset.
            // NOTE(review): UTF-8 is assumed for the data file - confirm.
            BufferedReader reader = new BufferedReader(new InputStreamReader(
                    input, "UTF-8"));
            String line;
            // readLine() returning null is the end-of-stream signal; ready()
            // only says whether a read would block.
            while ((line = reader.readLine()) != null) {
                String[] params = parse(line);
                if (params.length == 2) {
                    data.put(params[0], Arrays.asList(params[1].split(" ")));
                }
            }
        } finally {
            input.close();
        }
        return data;
    }
    
    /**
     * Opens a class-path resource relative to this class.
     *
     * @param name resource name
     * @return an open stream that the caller must close
     * @throws IOException if the resource does not exist
     */
    private InputStream openResource(String name) throws IOException {
        InputStream stream = getClass().getResourceAsStream(name);
        if (stream == null) {
            throw new IOException("resource not found: " + name);
        }
        return stream;
    }
    
}

テスト対象のデータ(testData.txt)。http://amachang.art-code.org/xpath_functional_test/から、機能テスト用のデータ部分だけ抽出。

        ['.//blockquote/*', 'br p font'],
        ['.//blockquote/child::*', 'br p font'],
        ['.//blockquote/parent::*', 'center'],
        ['.//blockquote/descendant::*', 'br p del ins font'],
        ['.//blockquote/descendant-or-self::*', 'blockquote br p del ins font'],
        ['.//blockquote/ancestor::*', 'html body div center'],
        ['.//blockquote/ancestor-or-self::*', 'html body div center blockquote'],
        ['.//blockquote/following-sibling::*', 'h3 h4'],
        ['.//blockquote/preceding-sibling::*', 'h1 h2'],
        ['.//blockquote/following::*', 'h3 dfn a h4 sub sup span abbr q'],
        ['.//blockquote/preceding::*', 'head title script dl dt dd h1 em strong h2 b s'],
        ['.//blockquote/self::*', 'blockquote'],
        ['.//blockquote/attribute::id/parent::*', 'blockquote'],
        ['.//blockquote/@id/parent::*', 'blockquote'],


        ['.//*[blockquote]', 'center'],
        ['.//*[child::blockquote]', 'center'],
        ['.//*[parent::blockquote]', 'br p font'],
        ['.//*[descendant::blockquote]', 'div center'],
        ['.//*[descendant-or-self::blockquote]', 'div center blockquote'],
        ['.//*[ancestor::blockquote]', 'br p del ins font'],
        ['.//*[ancestor-or-self::blockquote]', 'blockquote br p del ins font'],
        ['.//*[following-sibling::blockquote]', 'h1 h2'],
        ['.//*[preceding-sibling::blockquote]', 'h3 h4'],
        ['.//*[following::blockquote]', 'dl dt dd h1 em strong h2 b s'],
        ['.//*[preceding::blockquote]', 'h3 dfn a h4 sub sup span abbr q'],
        ['.//*[self::blockquote]', 'blockquote'],
        ['.//*[@id]', 'div dl dt dd center h1 em strong h2 b s blockquote br p del ins font h3 dfn a h4 sub sup span abbr q'],
        ['.//*[attribute::id]', 'div dl dt dd center h1 em strong h2 b s blockquote br p del ins font h3 dfn a h4 sub sup span abbr q'],


        ['.//blockquote/text()', 't:blockquoteText1: t:blockquoteText2'],
        ['.//blockquote/comment()', 'c:blockquoteComment'],
        ['.//blockquote/processing-instruction()', 'p:pi'],
        ['.//blockquote/processing-instruction("pi")', 'p:pi'],
        ['.//blockquote/node()', 'c:blockquoteComment t:blockquoteText1: br t:blockquoteText2 p p:pi font'],
        ['.//blockquote/p', 'p'],
        ['.//blockquote/*', 'br p font'],


        ['.//*[child::* and preceding::font]', 'h3 h4 span'],
        ['.//*[not(child::*) and preceding::font]', 'dfn a sub sup abbr q'],
        ['.//*[preceding::blockquote or following::blockquote]', 
                            'dl dt dd h1 em strong h2 b s h3 dfn a h4 sub sup span abbr q'],
        ['.//blockquote/ancestor::* | .//blockquote/descendant::*', 'html body div center br p del ins font'],
        ['.//*[.="sub"]', 'sub'],
        ['.//*[@title > 12 and @class < 15]', 'br p del ins font'], 
        ['.//*[@title != @class]',
                'div dl dt dd center em strong b s blockquote br p del ins font dfn a sub sup span abbr q'], 
        ['.//*[((@class * @class + @title * @title) div (@class + @title)) > ((@class - @title) * (@class - @title))]',
                'dl h1 h2 s blockquote br p font h3 dfn a h4 sub sup span abbr q'],
        ['.//*[@title mod 2 = 0]', 'dl dd h1 strong b blockquote p ins h3 a sub span q'],


        ['.//blockquote/child::*[last()]', 'font'],
        ['.//blockquote/descendant::*[position() < 4]', 'br p del'],
        ['id(.//font/@face)', 'strong q'],
        ['.//*[name(.) = "sub"]', 'sub'],
        ['.//*[name() = "sub"]', 'sub'],


        ['.//blockquote/child::*[2]', 'p'],
        ['.//blockquote/descendant::*[4]', 'ins'],
        ['.//blockquote/descendant-or-self::*[4]', 'del'],
        ['.//blockquote/ancestor::*[2]', 'div'],
        ['.//blockquote/ancestor-or-self::*[2]', 'center'],
        ['.//blockquote/following-sibling::*[1]', 'h3'],
        ['.//blockquote/preceding-sibling::*[1]', 'h2'],
        ['.//blockquote/following::*[4]', 'h4'],
        ['.//blockquote/preceding::*[4]', 'strong'],

        
        ['.//*[starts-with(.,"s")]', 'strong s h4 sub sup'],
        ['.//*[string(@title - 1) = "0"]', 'div'],
        ['.//*[string() = "sub"]', 'sub'],
        ['.//*[string(.) = "sub"]', 'sub'],
        ['.//*[concat(.,..) = "subsubsup"]', 'sub'],
        ['.//node()[concat(.,..,../..) = "bbbs"]', 't:b'],
        ['.//*[starts-with(.,"s")]', 'strong s h4 sub sup'],
        ['.//*[substring-before(.,"u") = "s"]', 'h4 sub sup'],
        ['.//*[substring-after(.,"on") = "t"]', 'blockquote font'],
        ['.//*[substring(.,2,1) = "u"]', 'h4 sub sup'],
        ['.//*[substring(.,2) = "up"]', 'sup'],
        ['.//*[contains(.,"b")]', 'div center h2 b blockquote h4 sub span abbr'],
        ['.//*[string-length() = 3]', 'del ins dfn sub sup'],
        ['.//*[string-length(.) = 3]', 'del ins dfn sub sup'],
        ['.//*[.=translate(normalize-space("  s  u  b  ")," ","")]', 'sub'],
        ['.//*[normalize-space()="q"]', 'q'],


        ['.//*[boolean(@title - 1) = false()]', 'div'],
        ['.//*[not(@title - 1) = true()]', 'div'],
        ['.//*[lang("it")]', 'q'],


        ['.//*[number(@title) < number(@class)]', 'div dl center blockquote span'],
        ['.//*[sum(ancestor::*/@title) < sum(descendant::*/@title)]',
                'div dl center h1 h2 blockquote p h3 h4 span'],
        ['.//*[floor(@title div @class) = 1]',
                'h1 em strong h2 b s br p del ins font h3 dfn a h4 sub sup abbr q'],
        ['.//*[ceiling(@title div @class) = 1]', 'div dl center h1 h2 blockquote h3 h4 span'],
        ['.//*[round(@title div @class) = 1]',
                'dl h1 h2 b s blockquote br p del ins font h3 dfn a h4 sub sup span abbr q'],


        ['.//*[blockquote]', 'center']

テスト用HTMLは、解析でエラーにならないように、scriptの中を抜いただけ。

<html>
 <head>
  <title>XPath Test</title>
  <script>
  </script>
 </head>
 <body><div id="n1" title="1" class="26" xml:lang="en"><dl id="n2" title="2" class="3"><dt id="n3" title="3" class="1">dt</dt><dd id="n4" title="4" class="2">dd</dd></dl><center id="n5" title="5" class="22"><h1 id="n6" title="6" class="6"><em id="n7" title="7" class="4">em</em><strong id="n8" title="8" class="5">strong</strong></h1><h2 id="n9" title="9" class="9"><b id="n10" title="10" class="7">b</b><s id="n11" title="11" class="8">s</s></h2><blockquote id="n12" title="12" class="15"><!--blockquoteComment-->blockquoteText1:<br id="n13" title="13" class="10"/>blockquoteText2<p id="n14" title="14" class="13"><del id="n15" title="15" class="11">del</del><ins id="n16" title="16" class="12">ins</ins></p><?pi name="value"?><font id="n17" title="17" class="14" face="n8 n26">font</font></blockquote><h3 id="n18" title="18" class="18"><dfn id="n19" title="19" class="16">dfn</dfn><a id="n20" title="20" class="17">a</a></h3><h4 id="n21" title="21" class="21"><sub id="n22" title="22" class="19">sub</sub><sup id="n23" title="23" class="20">sup</sup></h4></center><span id="n24" title="24" class="25"><abbr id="n25" title="25" class="23">abbr</abbr><q id="n26" title="26" class="24" cite="n8 n17" xml:lang="it">q</q></span></div></body>
</html>

以下、実行結果。
環境は、MacOS X(Leopard)のJDK1.5。

[ok].//blockquote/* 34ms
[ok].//blockquote/child::* 7ms
[ok].//blockquote/parent::* 5ms
[ok].//blockquote/descendant::* 6ms
[ok].//blockquote/descendant-or-self::* 5ms
[ok].//blockquote/ancestor::* 5ms
[ok].//blockquote/ancestor-or-self::* 8ms
[ok].//blockquote/following-sibling::* 7ms
[ok].//blockquote/preceding-sibling::* 5ms
[ok].//blockquote/following::* 8ms
[ok].//blockquote/preceding::* 6ms
[ok].//blockquote/self::* 3ms
[ok].//blockquote/attribute::id/parent::* 4ms
[ok].//blockquote/@id/parent::* 3ms
[ok].//*[blockquote] 1ms
[ok].//*[child::blockquote] 3ms
[ok].//*[parent::blockquote] 1ms
[ng].//*[descendant::blockquote] 1ms
expected : [div, center], value : [html, body, div, center]
[ng].//*[descendant-or-self::blockquote] 1ms
expected : [div, center, blockquote], value : [html, body, div, center, blockquote]
[ok].//*[ancestor::blockquote] 5ms
[ok].//*[ancestor-or-self::blockquote] 1ms
[ok].//*[following-sibling::blockquote] 1ms
[ok].//*[preceding-sibling::blockquote] 1ms
[ng].//*[following::blockquote] 2ms
expected : [dl, dt, dd, h1, em, strong, h2, b, s], value : [head, title, script, dl, dt, dd, h1, em, strong, h2, b, s]
[ok].//*[preceding::blockquote] 1ms
[ok].//*[self::blockquote] 1ms
[ok].//*[@id] 2ms
[ok].//*[attribute::id] 1ms
[ng].//blockquote/text() 3ms
expected : [t:blockquoteText1:, t:blockquoteText2], value : [#text, #text]
[ng].//blockquote/comment() 2ms
expected : [c:blockquoteComment], value : [#comment]
[ng].//blockquote/processing-instruction() 2ms
expected : [p:pi], value : [pi]
[ng].//blockquote/processing-instruction("pi") 2ms
expected : [p:pi], value : [pi]
[ng].//blockquote/node() 5ms
expected : [c:blockquoteComment, t:blockquoteText1:, br, t:blockquoteText2, p, p:pi, font], value : [#comment, #text, br, #text, p, pi, font]
[ok].//blockquote/p 2ms
[ok].//*[child::* and preceding::font] 4ms
[ok].//*[not(child::*) and preceding::font] 6ms
[ok].//blockquote/ancestor::* | .//blockquote/descendant::* 2ms
[ok].//*[.="sub"] 2ms
[ok].//*[@title > 12 and @class < 15] 3ms
[ok].//*[@title mod 2 = 0] 5ms
[ok].//blockquote/child::*[last()] 2ms
[ok].//blockquote/descendant::*[position() < 4] 2ms
[ng]id(.//font/@face) 3ms
expected : [strong, q], value : []
[ok].//*[name(.) = "sub"] 4ms
[ok].//*[name() = "sub"] 1ms
[ok].//blockquote/child::*[2] 2ms
[ok].//blockquote/descendant::*[4] 1ms
[ok].//blockquote/descendant-or-self::*[4] 2ms
[ok].//blockquote/ancestor::*[2] 5ms
[ok].//blockquote/ancestor-or-self::*[2] 2ms
[ok].//blockquote/following-sibling::*[1] 2ms
[ok].//blockquote/preceding-sibling::*[1] 2ms
[ok].//blockquote/following::*[4] 4ms
[ok].//blockquote/preceding::*[4] 2ms
[ok].//*[starts-with(.,"s")] 2ms
[ok].//*[string(@title - 1) = "0"] 3ms
[ok].//*[string() = "sub"] 1ms
[ok].//*[string(.) = "sub"] 2ms
[ok].//*[concat(.,..) = "subsubsup"] 2ms
[ng].//node()[concat(.,..,../..) = "bbbs"] 2ms
expected : [t:b], value : [#text]
[ok].//*[substring-before(.,"u") = "s"] 2ms
[ok].//*[substring-after(.,"on") = "t"] 1ms
[ok].//*[substring(.,2,1) = "u"] 3ms
[ok].//*[substring(.,2) = "up"] 1ms
[ng].//*[contains(.,"b")] 2ms
expected : [div, center, h2, b, blockquote, h4, sub, span, abbr], value : [html, body, div, center, h2, b, blockquote, h4, sub, span, abbr]
[ok].//*[string-length() = 3] 1ms
[ok].//*[string-length(.) = 3] 1ms
[ok].//*[.=translate(normalize-space("  s  u  b  ")," ","")] 2ms
[ok].//*[normalize-space()="q"] 2ms
[ng].//*[boolean(@title - 1) = false()] 3ms
expected : [div], value : [html, head, title, script, body, div]
[ng].//*[not(@title - 1) = true()] 4ms
expected : [div], value : [html, head, title, script, body, div]
[ok].//*[lang("it")] 2ms
[ok].//*[number(@title) < number(@class)] 2ms
[ok].//*[ceiling(@title div @class) = 1] 3ms

結構、サポートされているものも多いので、ぬこたんとか、多少汚いHTMLでもパースしてくれるものを使えば、Web::Scraper相当のものを作れそうな感じ。

<<追記>>
Windows環境のJDK1.5.0_12だと以下のような結果。なんかOSXでかかった時間と大分違うのが気になるところ。

[ok].//blockquote/* 78ms
[ok].//blockquote/child::* 16ms
[ok].//blockquote/parent::* 16ms
[ok].//blockquote/descendant::* 0ms
[ok].//blockquote/descendant-or-self::* 15ms
[ok].//blockquote/ancestor::* 0ms
[ok].//blockquote/ancestor-or-self::* 16ms
[ok].//blockquote/following-sibling::* 16ms
[ok].//blockquote/preceding-sibling::* 15ms
[ok].//blockquote/following::* 16ms
[ok].//blockquote/preceding::* 0ms
[ok].//blockquote/self::* 15ms
[ok].//blockquote/attribute::id/parent::* 0ms
[ok].//blockquote/@id/parent::* 16ms
[ok].//*[blockquote] 0ms
[ok].//*[child::blockquote] 16ms
[ok].//*[parent::blockquote] 0ms
[ng].//*[descendant::blockquote] 0ms
expected : [div, center], value : [html, body, div, center]
[ng].//*[descendant-or-self::blockquote] 0ms
expected : [div, center, blockquote], value : [html, body, div, center, blockquote]
[ok].//*[ancestor::blockquote] 0ms
[ok].//*[ancestor-or-self::blockquote] 0ms
[ok].//*[following-sibling::blockquote] 0ms
[ok].//*[preceding-sibling::blockquote] 0ms
[ng].//*[following::blockquote] 0ms
expected : [dl, dt, dd, h1, em, strong, h2, b, s], value : [head, title, script, dl, dt, dd, h1, em, strong, h2, b, s]
[ok].//*[preceding::blockquote] 0ms
[ok].//*[self::blockquote] 0ms
[ok].//*[@id] 0ms
[ok].//*[attribute::id] 0ms
[ng].//blockquote/text() 15ms
expected : [t:blockquoteText1:, t:blockquoteText2], value : [#text, #text, #text]
[ng].//blockquote/comment() 0ms
expected : [c:blockquoteComment], value : [#comment]
[ng].//blockquote/processing-instruction() 0ms
expected : [p:pi], value : [pi]
[ng].//blockquote/processing-instruction("pi") 32ms
expected : [p:pi], value : [pi]
[ng].//blockquote/node() 0ms
expected : [c:blockquoteComment, t:blockquoteText1:, br, t:blockquoteText2, p, p:pi, font], value : [#comment, #text, br, #text, p, #text, pi, font]
[ok].//blockquote/p 0ms
[ok].//*[child::* and preceding::font] 0ms
[ok].//*[not(child::*) and preceding::font] 15ms
[ok].//blockquote/ancestor::* | .//blockquote/descendant::* 0ms
[ok].//*[.="sub"] 0ms
[ok].//*[@title > 12 and @class < 15] 0ms
[ok].//*[@title mod 2 = 0] 16ms
[ok].//blockquote/child::*[last()] 0ms
[ok].//blockquote/descendant::*[position() < 4] 0ms
[ng]id(.//font/@face) 0ms
expected : [strong, q], value : []
[ok].//*[name(.) = "sub"] 0ms
[ok].//*[name() = "sub"] 16ms
[ok].//blockquote/child::*[2] 0ms
[ok].//blockquote/descendant::*[4] 0ms
[ok].//blockquote/descendant-or-self::*[4] 0ms
[ok].//blockquote/ancestor::*[2] 0ms
[ok].//blockquote/ancestor-or-self::*[2] 15ms
[ok].//blockquote/following-sibling::*[1] 0ms
[ok].//blockquote/preceding-sibling::*[1] 0ms
[ok].//blockquote/following::*[4] 0ms
[ok].//blockquote/preceding::*[4] 16ms
[ok].//*[starts-with(.,"s")] 0ms
[ok].//*[string(@title - 1) = "0"] 0ms
[ok].//*[string() = "sub"] 0ms
[ok].//*[string(.) = "sub"] 0ms
[ok].//*[concat(.,..) = "subsubsup"] 0ms
[ng].//node()[concat(.,..,../..) = "bbbs"] 16ms
expected : [t:b], value : [#text]
[ok].//*[substring-before(.,"u") = "s"] 0ms
[ok].//*[substring-after(.,"on") = "t"] 0ms
[ok].//*[substring(.,2,1) = "u"] 15ms
[ok].//*[substring(.,2) = "up"] 0ms
[ng].//*[contains(.,"b")] 0ms
expected : [div, center, h2, b, blockquote, h4, sub, span, abbr], value : [html, body, div, center, h2, b, blockquote, h4, sub, span, abbr]
[ng].//*[string-length() = 3] 0ms
expected : [del, ins, dfn, sub, sup], value : [script, del, ins, dfn, sub, sup]
[ng].//*[string-length(.) = 3] 16ms
expected : [del, ins, dfn, sub, sup], value : [script, del, ins, dfn, sub, sup]
[ok].//*[.=translate(normalize-space("  s  u  b  ")," ","")] 0ms
[ok].//*[normalize-space()="q"] 0ms
[ng].//*[boolean(@title - 1) = false()] 16ms
expected : [div], value : [html, head, title, script, body, div]
[ng].//*[not(@title - 1) = true()] 0ms
expected : [div], value : [html, head, title, script, body, div]
[ok].//*[lang("it")] 0ms
[ok].//*[number(@title) < number(@class)] 0ms
[ok].//*[ceiling(@title div @class) = 1] 0ms