amachangのXPath機能テストで、JavaのXPathAPIのテストをしてみた。
本当はHTMLを取得して解析しようと思ったが、XHTMLではないためDocumentBuilderでの解析に失敗する。その対処が面倒なのと、HttpClientなどを使うと依存が増えるので、とりあえず該当箇所だけ引っぱり出してテストしてみた。
package xpath; import com.sun.org.apache.xpath.internal.XPathAPI; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.Arrays; import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.TransformerException; import junit.framework.TestCase; import org.w3c.dom.Document; import org.w3c.dom.Node; import org.w3c.dom.traversal.NodeIterator; import org.xml.sax.SAXException; public class XPathCoverageTest extends TestCase{ public void test_checkCoverage() { try { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setNamespaceAware(true); DocumentBuilder builder = factory.newDocumentBuilder(); // 解析対象のHTMLの読み込み Document doc = builder.parse(getClass().getResourceAsStream("test.html")); //テスト対象のケース取得 Map<String, List<String>> testData = readData("testData.txt"); for (String key : testData.keySet()) { long start = System.currentTimeMillis(); // NodeIteratorの取得 NodeIterator nl = XPathAPI.selectNodeIterator(doc, key); long processTime = System.currentTimeMillis() - start; Node node; List<String> nodeList = new LinkedList<String>(); while ((node = nl.nextNode()) != null) { nodeList.add(node.getNodeName()); } boolean isValid = testData.get(key).equals(nodeList); if (isValid) { System.out.println("[ok]" + key + " " + processTime + "ms"); } else { System.out.println("[ng]" + key + " " + processTime + "ms"); System.out.println("expected : " + testData.get(key) + ", value : " + nodeList.toString()); } } } catch (SAXException e) { } catch (IOException e) { } catch (ParserConfigurationException e) { } catch (TransformerException e) { } } /**左側の空白文字削除用パターン*/ private final Pattern LEFT_TRIM_PATTERN 
= Pattern.compile("^[\\s ]+"); /**右側の文字削除用パターン*/ private final Pattern RIGHT_TRIM_PATTERN = Pattern.compile("[\\s ,]+$"); /**パラメーター解析用パターン*/ private final Pattern PARAMTER_PATTERN = Pattern.compile("^\\['(.+)',\\s'(.+)'\\]"); private String[] parse(String params) { params = LEFT_TRIM_PATTERN.matcher( RIGHT_TRIM_PATTERN.matcher(params).replaceAll("")).replaceAll( ""); List<String> paramList = new LinkedList<String>(); Matcher matcher = PARAMTER_PATTERN.matcher(params); if (matcher.matches()) { for (int i = 0; i < matcher.groupCount(); i++) { paramList.add(matcher.group(i + 1)); } } return (String[]) paramList.toArray(new String[paramList.size()]); } /** * テスト用データ読み込み * @param dataFileName * @return */ private Map<String, List<String>> readData(String dataFileName) { InputStream input = null; Map<String, List<String>> data = new LinkedHashMap<String, List<String>>(); try { input = getClass().getResourceAsStream(dataFileName); BufferedReader reader = new BufferedReader(new InputStreamReader( input)); while (reader.ready()) { String[] params = null; if ((params = parse(reader.readLine())).length == 2) { data.put(params[0], Arrays.asList(params[1].split(" "))); } } } catch (IOException e) { } finally { if (input != null) { try { input.close(); } catch (IOException e) { } } } return data; } }
テスト対象のデータ(testData.txt)。http://amachang.art-code.org/xpath_functional_test/から、機能テスト用のデータ部分だけ抽出。
['.//blockquote/*', 'br p font'], ['.//blockquote/child::*', 'br p font'], ['.//blockquote/parent::*', 'center'], ['.//blockquote/descendant::*', 'br p del ins font'], ['.//blockquote/descendant-or-self::*', 'blockquote br p del ins font'], ['.//blockquote/ancestor::*', 'html body div center'], ['.//blockquote/ancestor-or-self::*', 'html body div center blockquote'], ['.//blockquote/following-sibling::*', 'h3 h4'], ['.//blockquote/preceding-sibling::*', 'h1 h2'], ['.//blockquote/following::*', 'h3 dfn a h4 sub sup span abbr q'], ['.//blockquote/preceding::*', 'head title script dl dt dd h1 em strong h2 b s'], ['.//blockquote/self::*', 'blockquote'], ['.//blockquote/attribute::id/parent::*', 'blockquote'], ['.//blockquote/@id/parent::*', 'blockquote'], ['.//*[blockquote]', 'center'], ['.//*[child::blockquote]', 'center'], ['.//*[parent::blockquote]', 'br p font'], ['.//*[descendant::blockquote]', 'div center'], ['.//*[descendant-or-self::blockquote]', 'div center blockquote'], ['.//*[ancestor::blockquote]', 'br p del ins font'], ['.//*[ancestor-or-self::blockquote]', 'blockquote br p del ins font'], ['.//*[following-sibling::blockquote]', 'h1 h2'], ['.//*[preceding-sibling::blockquote]', 'h3 h4'], ['.//*[following::blockquote]', 'dl dt dd h1 em strong h2 b s'], ['.//*[preceding::blockquote]', 'h3 dfn a h4 sub sup span abbr q'], ['.//*[self::blockquote]', 'blockquote'], ['.//*[@id]', 'div dl dt dd center h1 em strong h2 b s blockquote br p del ins font h3 dfn a h4 sub sup span abbr q'], ['.//*[attribute::id]', 'div dl dt dd center h1 em strong h2 b s blockquote br p del ins font h3 dfn a h4 sub sup span abbr q'], ['.//blockquote/text()', 't:blockquoteText1: t:blockquoteText2'], ['.//blockquote/comment()', 'c:blockquoteComment'], ['.//blockquote/processing-instruction()', 'p:pi'], ['.//blockquote/processing-instruction("pi")', 'p:pi'], ['.//blockquote/node()', 'c:blockquoteComment t:blockquoteText1: br t:blockquoteText2 p p:pi font'], ['.//blockquote/p', 'p'], 
['.//blockquote/*', 'br p font'], ['.//*[child::* and preceding::font]', 'h3 h4 span'], ['.//*[not(child::*) and preceding::font]', 'dfn a sub sup abbr q'], ['.//*[preceding::blockquote or following::blockquote]', 'dl dt dd h1 em strong h2 b s h3 dfn a h4 sub sup span abbr q'], ['.//blockquote/ancestor::* | .//blockquote/descendant::*', 'html body div center br p del ins font'], ['.//*[.="sub"]', 'sub'], ['.//*[@title > 12 and @class < 15]', 'br p del ins font'], ['.//*[@title != @class]', 'div dl dt dd center em strong b s blockquote br p del ins font dfn a sub sup span abbr q'], ['.//*[((@class * @class + @title * @title) div (@class + @title)) > ((@class - @title) * (@class - @title))]', 'dl h1 h2 s blockquote br p font h3 dfn a h4 sub sup span abbr q'], ['.//*[@title mod 2 = 0]', 'dl dd h1 strong b blockquote p ins h3 a sub span q'], ['.//blockquote/child::*[last()]', 'font'], ['.//blockquote/descendant::*[position() < 4]', 'br p del'], ['id(.//font/@face)', 'strong q'], ['.//*[name(.) = "sub"]', 'sub'], ['.//*[name() = "sub"]', 'sub'], ['.//blockquote/child::*[2]', 'p'], ['.//blockquote/descendant::*[4]', 'ins'], ['.//blockquote/descendant-or-self::*[4]', 'del'], ['.//blockquote/ancestor::*[2]', 'div'], ['.//blockquote/ancestor-or-self::*[2]', 'center'], ['.//blockquote/following-sibling::*[1]', 'h3'], ['.//blockquote/preceding-sibling::*[1]', 'h2'], ['.//blockquote/following::*[4]', 'h4'], ['.//blockquote/preceding::*[4]', 'strong'], ['.//*[starts-with(.,"s")]', 'strong s h4 sub sup'], ['.//*[string(@title - 1) = "0"]', 'div'], ['.//*[string() = "sub"]', 'sub'], ['.//*[string(.) = "sub"]', 'sub'], ['.//*[concat(.,..) = "subsubsup"]', 'sub'], ['.//node()[concat(.,..,../..) 
= "bbbs"]', 't:b'], ['.//*[starts-with(.,"s")]', 'strong s h4 sub sup'], ['.//*[substring-before(.,"u") = "s"]', 'h4 sub sup'], ['.//*[substring-after(.,"on") = "t"]', 'blockquote font'], ['.//*[substring(.,2,1) = "u"]', 'h4 sub sup'], ['.//*[substring(.,2) = "up"]', 'sup'], ['.//*[contains(.,"b")]', 'div center h2 b blockquote h4 sub span abbr'], ['.//*[string-length() = 3]', 'del ins dfn sub sup'], ['.//*[string-length(.) = 3]', 'del ins dfn sub sup'], ['.//*[.=translate(normalize-space(" s u b ")," ","")]', 'sub'], ['.//*[normalize-space()="q"]', 'q'], ['.//*[boolean(@title - 1) = false()]', 'div'], ['.//*[not(@title - 1) = true()]', 'div'], ['.//*[lang("it")]', 'q'], ['.//*[number(@title) < number(@class)]', 'div dl center blockquote span'], ['.//*[sum(ancestor::*/@title) < sum(descendant::*/@title)]', 'div dl center h1 h2 blockquote p h3 h4 span'], ['.//*[floor(@title div @class) = 1]', 'h1 em strong h2 b s br p del ins font h3 dfn a h4 sub sup abbr q'], ['.//*[ceiling(@title div @class) = 1]', 'div dl center h1 h2 blockquote h3 h4 span'], ['.//*[round(@title div @class) = 1]', 'dl h1 h2 b s blockquote br p del ins font h3 dfn a h4 sub sup span abbr q'], ['.//*[blockquote]', 'center']
テスト用HTMLは、解析でエラーにならないように、scriptの中を抜いただけ。
<html> <head> <title>XPath Test</title> <script> </script> </head> <body><div id="n1" title="1" class="26" xml:lang="en"><dl id="n2" title="2" class="3"><dt id="n3" title="3" class="1">dt</dt><dd id="n4" title="4" class="2">dd</dd></dl><center id="n5" title="5" class="22"><h1 id="n6" title="6" class="6"><em id="n7" title="7" class="4">em</em><strong id="n8" title="8" class="5">strong</strong></h1><h2 id="n9" title="9" class="9"><b id="n10" title="10" class="7">b</b><s id="n11" title="11" class="8">s</s></h2><blockquote id="n12" title="12" class="15"><!--blockquoteComment-->blockquoteText1:<br id="n13" title="13" class="10"/>blockquoteText2<p id="n14" title="14" class="13"><del id="n15" title="15" class="11">del</del><ins id="n16" title="16" class="12">ins</ins></p><?pi name="value"?><font id="n17" title="17" class="14" face="n8 n26">font</font></blockquote><h3 id="n18" title="18" class="18"><dfn id="n19" title="19" class="16">dfn</dfn><a id="n20" title="20" class="17">a</a></h3><h4 id="n21" title="21" class="21"><sub id="n22" title="22" class="19">sub</sub><sup id="n23" title="23" class="20">sup</sup></h4></center><span id="n24" title="24" class="25"><abbr id="n25" title="25" class="23">abbr</abbr><q id="n26" title="26" class="24" cite="n8 n17" xml:lang="it">q</q></span></div></body> </html>
以下、実行結果。
環境は、MacOS X(Leopard)のJDK1.5。
[ok].//blockquote/* 34ms [ok].//blockquote/child::* 7ms [ok].//blockquote/parent::* 5ms [ok].//blockquote/descendant::* 6ms [ok].//blockquote/descendant-or-self::* 5ms [ok].//blockquote/ancestor::* 5ms [ok].//blockquote/ancestor-or-self::* 8ms [ok].//blockquote/following-sibling::* 7ms [ok].//blockquote/preceding-sibling::* 5ms [ok].//blockquote/following::* 8ms [ok].//blockquote/preceding::* 6ms [ok].//blockquote/self::* 3ms [ok].//blockquote/attribute::id/parent::* 4ms [ok].//blockquote/@id/parent::* 3ms [ok].//*[blockquote] 1ms [ok].//*[child::blockquote] 3ms [ok].//*[parent::blockquote] 1ms [ng].//*[descendant::blockquote] 1ms expected : [div, center], value : [html, body, div, center] [ng].//*[descendant-or-self::blockquote] 1ms expected : [div, center, blockquote], value : [html, body, div, center, blockquote] [ok].//*[ancestor::blockquote] 5ms [ok].//*[ancestor-or-self::blockquote] 1ms [ok].//*[following-sibling::blockquote] 1ms [ok].//*[preceding-sibling::blockquote] 1ms [ng].//*[following::blockquote] 2ms expected : [dl, dt, dd, h1, em, strong, h2, b, s], value : [head, title, script, dl, dt, dd, h1, em, strong, h2, b, s] [ok].//*[preceding::blockquote] 1ms [ok].//*[self::blockquote] 1ms [ok].//*[@id] 2ms [ok].//*[attribute::id] 1ms [ng].//blockquote/text() 3ms expected : [t:blockquoteText1:, t:blockquoteText2], value : [#text, #text] [ng].//blockquote/comment() 2ms expected : [c:blockquoteComment], value : [#comment] [ng].//blockquote/processing-instruction() 2ms expected : [p:pi], value : [pi] [ng].//blockquote/processing-instruction("pi") 2ms expected : [p:pi], value : [pi] [ng].//blockquote/node() 5ms expected : [c:blockquoteComment, t:blockquoteText1:, br, t:blockquoteText2, p, p:pi, font], value : [#comment, #text, br, #text, p, pi, font] [ok].//blockquote/p 2ms [ok].//*[child::* and preceding::font] 4ms [ok].//*[not(child::*) and preceding::font] 6ms [ok].//blockquote/ancestor::* | .//blockquote/descendant::* 2ms [ok].//*[.="sub"] 2ms 
[ok].//*[@title > 12 and @class < 15] 3ms [ok].//*[@title mod 2 = 0] 5ms [ok].//blockquote/child::*[last()] 2ms [ok].//blockquote/descendant::*[position() < 4] 2ms [ng]id(.//font/@face) 3ms expected : [strong, q], value : [] [ok].//*[name(.) = "sub"] 4ms [ok].//*[name() = "sub"] 1ms [ok].//blockquote/child::*[2] 2ms [ok].//blockquote/descendant::*[4] 1ms [ok].//blockquote/descendant-or-self::*[4] 2ms [ok].//blockquote/ancestor::*[2] 5ms [ok].//blockquote/ancestor-or-self::*[2] 2ms [ok].//blockquote/following-sibling::*[1] 2ms [ok].//blockquote/preceding-sibling::*[1] 2ms [ok].//blockquote/following::*[4] 4ms [ok].//blockquote/preceding::*[4] 2ms [ok].//*[starts-with(.,"s")] 2ms [ok].//*[string(@title - 1) = "0"] 3ms [ok].//*[string() = "sub"] 1ms [ok].//*[string(.) = "sub"] 2ms [ok].//*[concat(.,..) = "subsubsup"] 2ms [ng].//node()[concat(.,..,../..) = "bbbs"] 2ms expected : [t:b], value : [#text] [ok].//*[substring-before(.,"u") = "s"] 2ms [ok].//*[substring-after(.,"on") = "t"] 1ms [ok].//*[substring(.,2,1) = "u"] 3ms [ok].//*[substring(.,2) = "up"] 1ms [ng].//*[contains(.,"b")] 2ms expected : [div, center, h2, b, blockquote, h4, sub, span, abbr], value : [html, body, div, center, h2, b, blockquote, h4, sub, span, abbr] [ok].//*[string-length() = 3] 1ms [ok].//*[string-length(.) = 3] 1ms [ok].//*[.=translate(normalize-space(" s u b ")," ","")] 2ms [ok].//*[normalize-space()="q"] 2ms [ng].//*[boolean(@title - 1) = false()] 3ms expected : [div], value : [html, head, title, script, body, div] [ng].//*[not(@title - 1) = true()] 4ms expected : [div], value : [html, head, title, script, body, div] [ok].//*[lang("it")] 2ms [ok].//*[number(@title) < number(@class)] 2ms [ok].//*[ceiling(@title div @class) = 1] 3ms
結構、サポートされているものも多いので、ぬこたんとか、多少汚いHTMLでもパースしてくれるものを使えば、Web::Scraper相当のものを作れそうな感じ。
<<追記>>
Windows環境のJDK1.5.0_12だと以下のような結果。なんかOSXでかかった時間と大分違うのが気になるところ。
[ok].//blockquote/* 78ms [ok].//blockquote/child::* 16ms [ok].//blockquote/parent::* 16ms [ok].//blockquote/descendant::* 0ms [ok].//blockquote/descendant-or-self::* 15ms [ok].//blockquote/ancestor::* 0ms [ok].//blockquote/ancestor-or-self::* 16ms [ok].//blockquote/following-sibling::* 16ms [ok].//blockquote/preceding-sibling::* 15ms [ok].//blockquote/following::* 16ms [ok].//blockquote/preceding::* 0ms [ok].//blockquote/self::* 15ms [ok].//blockquote/attribute::id/parent::* 0ms [ok].//blockquote/@id/parent::* 16ms [ok].//*[blockquote] 0ms [ok].//*[child::blockquote] 16ms [ok].//*[parent::blockquote] 0ms [ng].//*[descendant::blockquote] 0ms expected : [div, center], value : [html, body, div, center] [ng].//*[descendant-or-self::blockquote] 0ms expected : [div, center, blockquote], value : [html, body, div, center, blockquote] [ok].//*[ancestor::blockquote] 0ms [ok].//*[ancestor-or-self::blockquote] 0ms [ok].//*[following-sibling::blockquote] 0ms [ok].//*[preceding-sibling::blockquote] 0ms [ng].//*[following::blockquote] 0ms expected : [dl, dt, dd, h1, em, strong, h2, b, s], value : [head, title, script, dl, dt, dd, h1, em, strong, h2, b, s] [ok].//*[preceding::blockquote] 0ms [ok].//*[self::blockquote] 0ms [ok].//*[@id] 0ms [ok].//*[attribute::id] 0ms [ng].//blockquote/text() 15ms expected : [t:blockquoteText1:, t:blockquoteText2], value : [#text, #text, #text] [ng].//blockquote/comment() 0ms expected : [c:blockquoteComment], value : [#comment] [ng].//blockquote/processing-instruction() 0ms expected : [p:pi], value : [pi] [ng].//blockquote/processing-instruction("pi") 32ms expected : [p:pi], value : [pi] [ng].//blockquote/node() 0ms expected : [c:blockquoteComment, t:blockquoteText1:, br, t:blockquoteText2, p, p:pi, font], value : [#comment, #text, br, #text, p, #text, pi, font] [ok].//blockquote/p 0ms [ok].//*[child::* and preceding::font] 0ms [ok].//*[not(child::*) and preceding::font] 15ms [ok].//blockquote/ancestor::* | .//blockquote/descendant::* 0ms 
[ok].//*[.="sub"] 0ms [ok].//*[@title > 12 and @class < 15] 0ms [ok].//*[@title mod 2 = 0] 16ms [ok].//blockquote/child::*[last()] 0ms [ok].//blockquote/descendant::*[position() < 4] 0ms [ng]id(.//font/@face) 0ms expected : [strong, q], value : [] [ok].//*[name(.) = "sub"] 0ms [ok].//*[name() = "sub"] 16ms [ok].//blockquote/child::*[2] 0ms [ok].//blockquote/descendant::*[4] 0ms [ok].//blockquote/descendant-or-self::*[4] 0ms [ok].//blockquote/ancestor::*[2] 0ms [ok].//blockquote/ancestor-or-self::*[2] 15ms [ok].//blockquote/following-sibling::*[1] 0ms [ok].//blockquote/preceding-sibling::*[1] 0ms [ok].//blockquote/following::*[4] 0ms [ok].//blockquote/preceding::*[4] 16ms [ok].//*[starts-with(.,"s")] 0ms [ok].//*[string(@title - 1) = "0"] 0ms [ok].//*[string() = "sub"] 0ms [ok].//*[string(.) = "sub"] 0ms [ok].//*[concat(.,..) = "subsubsup"] 0ms [ng].//node()[concat(.,..,../..) = "bbbs"] 16ms expected : [t:b], value : [#text] [ok].//*[substring-before(.,"u") = "s"] 0ms [ok].//*[substring-after(.,"on") = "t"] 0ms [ok].//*[substring(.,2,1) = "u"] 15ms [ok].//*[substring(.,2) = "up"] 0ms [ng].//*[contains(.,"b")] 0ms expected : [div, center, h2, b, blockquote, h4, sub, span, abbr], value : [html, body, div, center, h2, b, blockquote, h4, sub, span, abbr] [ng].//*[string-length() = 3] 0ms expected : [del, ins, dfn, sub, sup], value : [script, del, ins, dfn, sub, sup] [ng].//*[string-length(.) = 3] 16ms expected : [del, ins, dfn, sub, sup], value : [script, del, ins, dfn, sub, sup] [ok].//*[.=translate(normalize-space(" s u b ")," ","")] 0ms [ok].//*[normalize-space()="q"] 0ms [ng].//*[boolean(@title - 1) = false()] 16ms expected : [div], value : [html, head, title, script, body, div] [ng].//*[not(@title - 1) = true()] 0ms expected : [div], value : [html, head, title, script, body, div] [ok].//*[lang("it")] 0ms [ok].//*[number(@title) < number(@class)] 0ms [ok].//*[ceiling(@title div @class) = 1] 0ms