【问题标题】:Grouping and counting in XQuery(XQuery 中的分组和计数)
【发布时间】:2012-03-13 03:53:13
【问题描述】:

下面是 XML。我想获取在 2012 年 2 月 15 日至 2012 年 2 月 24 日这一日期范围内,每位作者发布的标题数量,并按标题数量从高到低排序。

<entries>
<entry>
    <id>1</id>
    <published>23/02/2012</published>
    <title>Title 1</title>
    <content type="html">This is title one</content>
    <author>
        <name>Pankaj</name>
    </author>
</entry>
<entry>
    <id>2</id>
    <published>22/02/2012</published>
    <title>Title 2</title>
    <content type="html">This is title two</content>
    <author>
        <name>Pankaj</name>
    </author>
</entry>
<entry>
    <id>3</id>
    <published>21/02/2012</published>
    <title>Title 3</title>
    <content type="html">This is title three</content>
    <author>
        <name>Rob</name>
    </author>
</entry>
<entry>
    <id>4</id>
    <published>20/02/2012</published>
    <title>Title 4</title>
    <content type="html">This is title four</content>
    <author>
        <name>Bob</name>
    </author>
</entry>
<entry>
    <id>5</id>
    <published>19/02/2012</published>
    <title>Title 1</title>
    <content type="html">This is title five</content>
    <author>
        <name>Pankaj</name>
    </author>
</entry>
</entries>

我正在尝试从 xquery 获取输出:

<?xml version="1.0" encoding="UTF-8"?>
<results>
<result>
    <author>
        <name>Pankaj</name>
    </author>
    <numberOfTitles>3</numberOfTitles>
</result>
<result>
    <author>
        <name>Rob</name>
    </author>
    <numberOfTitles>1</numberOfTitles>
</result>
<result>
    <author>
        <name>Bob</name>
    </author>
    <numberOfTitles>1</numberOfTitles>
</result>
</results>

请帮帮我..

【问题讨论】:

  • 这可能取决于您使用的 XQuery 版本。应该使用什么 XQuery 处理器/数据库来运行该查询?
  • 我正在使用 oXygen(Saxon-PE XQuery 9.2.0.6)进行开发。最后我必须通过 MarkLogic 上的 XCC API 运行这个查询。

标签: xquery marklogic


【解决方案1】:

此 XQuery 1.0 解决方案可由任何兼容的 XQuery 1.0 处理器执行

注意:既不使用 group by,也不使用 distinct-values()。

<results> 
 {
 (: Select the entries whose dd/mm/yyyy <published> value, rewritten as
    yyyy-mm-dd, falls inside the inclusive window 2012-02-15 .. 2012-02-24. :)
 let $from := xs:date('2012-02-15'),
     $to := xs:date('2012-02-24'),
     $inRange :=
        /*/entry
           [for $day in
                 xs:date(string-join(reverse(tokenize(published, '/')), '-'))
              return $from le $day and $day le $to],
     $names := $inRange/author/name
 return
    (: A name is kept only at the position of its first occurrence, which
       yields one representative <name> element per distinct author. :)
    for $name in $names[index-of($names, .)[1]]
    let $total := count(index-of($names, $name))
    order by $total descending
    return
      <result>
        <author>{$name}</author>
        <numberOfTitles>{$total}</numberOfTitles>
      </result>
    }
</results>

应用于提供的 XML 文档时

<entries>
    <entry>
        <id>1</id>
        <published>23/02/2012</published>
        <title>Title 1</title>
        <content type="html">This is title one</content>
        <author>
            <name>Pankaj</name>
        </author>
    </entry>
    <entry>
        <id>2</id>
        <published>22/02/2012</published>
        <title>Title 2</title>
        <content type="html">This is title two</content>
        <author>
            <name>Pankaj</name>
        </author>
    </entry>
    <entry>
        <id>3</id>
        <published>21/02/2012</published>
        <title>Title 3</title>
        <content type="html">This is title three</content>
        <author>
            <name>Rob</name>
        </author>
    </entry>
    <entry>
        <id>4</id>
        <published>20/02/2012</published>
        <title>Title 4</title>
        <content type="html">This is title four</content>
        <author>
            <name>Bob</name>
        </author>
    </entry>
    <entry>
        <id>5</id>
        <published>19/02/2012</published>
        <title>Title 1</title>
        <content type="html">This is title five</content>
        <author>
            <name>Pankaj</name>
        </author>
    </entry>
</entries>

产生想要的正确结果

<?xml version="1.0" encoding="UTF-8"?>
<results>
   <result>
      <author>
         <name>Pankaj</name>
      </author>
      <numberOfTitles>3</numberOfTitles>
   </result>
   <result>
      <author>
         <name>Rob</name>
      </author>
      <numberOfTitles>1</numberOfTitles>
   </result>
   <result>
      <author>
         <name>Bob</name>
      </author>
      <numberOfTitles>1</numberOfTitles>
   </result>
</results>

【讨论】:

    【解决方案2】:

    这里有一个专门针对 MarkLogic 的解决方案,使用映射(map)来高效地实现分组。输入 XML 已声明为 $INPUT,但您可以将其替换为对 doc() 或任何其他访问器的调用。

    我在去年的一篇博文中也探讨了这个话题:http://blakeley.com/blogofile/archives/560/

    (: Counts titles per author inside the date window, accumulating the
       per-author totals in a map, then emits one <result> per author with
       the highest count first. :)
    element results {
      let $m := map:map()
      let $start := xs:date('2012-02-15')
      let $stop := xs:date('2012-02-24')
      (: $group is never read; the binding exists so that the loop runs and
         updates $m as a side effect. :)
      let $group :=
        for $entry in $INPUT/entry
        let $key := $entry/author/name/string()
        let $date := xs:date(xdmp:parse-yymmdd("dd/MM/yyyy", $entry/published))
        where $date ge $start and $date le $stop
        (: (map:get(...), 0)[1] falls back to 0 the first time a key is seen. :)
        return map:put($m, $key, 1 + (map:get($m, $key), 0)[1])
      for $key in map:keys($m)
      let $count := map:get($m, $key)
      (: The question asks for high-to-low; a bare "order by" sorts ascending. :)
      order by $count descending
      return element result {
        element author { element name { $key }},
        element numberOfTitles { $count } } }
    

    【讨论】:

      【解决方案3】:

      这是我的解决方案:

      <results>{
        (: One <result> per author that has entries inside the date window,
           ordered by number of entries, highest first. :)
        for $entry in //entry
        (: <published> is dd/mm/yyyy; reversing the '/'-separated parts and
           joining with '-' gives the yyyy-mm-dd form that xs:date accepts. :)
        let $date := xs:date(string-join(reverse(tokenize($entry/published, '/')), '-')),
            (: Key on the <name> text only: the string value of the whole
               <author> element would also contain any whitespace text nodes
               around <name> when the input is not whitespace-stripped. :)
            $author := $entry/author/name/string()
        where xs:date('2012-02-15') le $date and $date le xs:date('2012-02-24')
        group by $author
        (: After grouping, $entry is the sequence of entries in the group. :)
        order by count($entry) descending
        return <result>{
          <author>
            <name>{$author}</name>
          </author>,
          <numberOfTitles>{count($entry)}</numberOfTitles>
        }</result>
      }</results>
      

      当使用BaseX 执行时,它会产生正确的结果。

      它使用了 XQuery 3.0 的特性(例如 group by),否则查询会更复杂。我不知道 MarkLogic 是否支持这些特性。

      【讨论】:

      【解决方案4】:

      以下内容应该适用于大多数处理器。您可以在 MarkLogic 中编写更高效的查询,但这足以让您入门。

      (: Counts entries per distinct author name over an inline copy of the
         sample document. No date filter or ordering is applied here. :)
      let $doc := <entries>
      <entry>
          <id>1</id>
          <published>23/02/2012</published>
          <title>Title 1</title>
          <content type="html">This is title one</content>
          <author>
              <name>Pankaj</name>
          </author>
      </entry>
      <entry>
          <id>2</id>
          <published>22/02/2012</published>
          <title>Title 2</title>
          <content type="html">This is title two</content>
          <author>
              <name>Pankaj</name>
          </author>
      </entry>
      <entry>
          <id>3</id>
          <published>21/02/2012</published>
          <title>Title 3</title>
          <content type="html">This is title three</content>
          <author>
              <name>Rob</name>
          </author>
      </entry>
      <entry>
          <id>4</id>
          <published>20/02/2012</published>
          <title>Title 4</title>
          <content type="html">This is title four</content>
          <author>
              <name>Bob</name>
          </author>
      </entry>
      <entry>
          <id>5</id>
          <published>19/02/2012</published>
          <title>Title 1</title>
          <content type="html">This is title five</content>
          <author>
              <name>Pankaj</name>
          </author>
      </entry>
      </entries>
      
      return
       <results>
          {
              for $author in distinct-values($doc/entry/author/name/string())
              return
              (: <numberOfTitles> is a sibling of <author>, matching the
                 result structure requested in the question. :)
              <result>
                  <author>
                      <name>{$author}</name>
                  </author>
                  <numberOfTitles>{count($doc/entry[author/name/string() eq $author])}</numberOfTitles>
              </result>
          }
       </results>
      

      【讨论】:

      • 您可以在 $doc/entry[author/name/string() eq $author and XXXX ] 等条目的谓词中添加日期约束;将 XXX 替换为解析您拥有的日期格式并进行必要比较的逻辑。
      • 这不会过滤日期,也不会排序,是吗?
      • 不,我很懒,但我会做一些类似于你的回答的事情。在谓词中再添加一个条件以按日期范围过滤,然后添加一个按 count($doc/entry[author/name/string() eq $author]) 排序的 order by 子句。
      【解决方案5】:

      这是另一个类似于 Leo Wörteler 的解决方案:

      (: Turns a '/'-separated date string such as 23/02/2012 into an xs:date
         by reversing its parts and joining them with '-' (2012-02-23). :)
      declare function local:FormatDate($origDate as xs:string) as xs:date 
        {
            let $parts := tokenize($origDate, '/')
            return xs:date(string-join(reverse($parts), '-'))
        };
      
      <results>
        {
        (: Inclusive publication window; constant, so bound once outside the loop. :)
        let $startDate := xs:date('2012-02-15')
        let $endDate := xs:date('2012-02-24')
        for $author in distinct-values(/entries/entry/author/name)
        (: Computed once and reused for filtering, sorting and output. :)
        let $count := count(/entries/entry[author/name=$author][$startDate <= local:FormatDate(published) and local:FormatDate(published) <= $endDate])
        (: The author list comes from all entries, so drop authors that have
           nothing inside the window instead of reporting them with 0. :)
        where $count gt 0
        order by $count descending
        return
          <result>
            <author>
              <name>{$author}</name>
            </author>
            <numberOfTitles>{$count}</numberOfTitles>
          </result>
        }
      </results>
      

      【讨论】:

        【解决方案6】:

        给基于映射(map)的解决方案 +1。其他解决方案在 FLWOR 内嵌套了 count(/entry/author[$name=xx]) 子句或其他 XPath,这实际上是一个嵌套循环。嵌套循环会导致 O(N^2) 的性能,这在测试中可能没问题,但一旦数据量增长就会变慢。

        【讨论】:

          猜你喜欢
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          相关资源
          最近更新 更多