top_k.pl

    1/*  Part of Extended Libraries for SWI-Prolog
    2
    3    Author:        Edison Mera
    4    E-mail:        efmera@gmail.com
    5    WWW:           https://github.com/edisonm/xlibrary
    6    Copyright (C): 2026, Process Design Center, Breda, The Netherlands.
    7    All rights reserved.
    8
    9    Redistribution and use in source and binary forms, with or without
   10    modification, are permitted provided that the following conditions
   11    are met:
   12
   13    1. Redistributions of source code must retain the above copyright
   14       notice, this list of conditions and the following disclaimer.
   15
   16    2. Redistributions in binary form must reproduce the above copyright
   17       notice, this list of conditions and the following disclaimer in
   18       the documentation and/or other materials provided with the
   19       distribution.
   20
   21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   24    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   25    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   26    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   27    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   28    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   29    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   31    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   32    POSSIBILITY OF SUCH DAMAGE.
   33*/
   34
   35:- module(top_k,
   36          [ top_k/3 % +Options, :Goal, -Result
   37          ]).   38
   39:- use_module(library(heaps)).   40:- use_module(library(assoc)).   41:- use_module(library(option)).   42:- use_module(library(solution_sequences)).   43
   44/* <Top-k selection problem>
   45
   46This module provides a predicate top_k/3, which is an efficient replacement of
   47the library(solution_sequences) when several sequencing options have to be
   48combined.
   49
   50The very well-known top-k selection problem says:
   51
   52- “Given a set of items, find the k items with the highest scores”
   53
   54Options:
   55  - limit(N)                        Optional, maximum number of solutions to process.
   56                                    Default is infinite.
   57  - order_by(asc(Key) | desc(Key))  Optional, Key is a term evaluated in Goal's context
   58  - distinct(W)                     term; skip solutions that bounds W to the same value.
   59                                    If not provided the original order is preserved
   60  - group_by(G)                     optional term; if given, do top-K per group, if not
   61                                    provided, it equates to put all in one single group
   62  - return(list(Term)|backtrack)    Returns the solutions as list of Term or backtracking
   63
   64semantically equivalent to:
   65    group_by(G, Goal, limit(N, distinct(W, order_by(OrderSpec, Goal))), GKGoalL),
   66    member(GK-GoalL, GKGoalL),
   67    member(Goal, GoalL).
   68
   69- Note that the chosen order of operations is not casual.
   70
   71- Performs better in terms of memory consumption.
   72
   73*/
   74
   75% Test:
   76
   77% ?- top_k([limit(3),distinct(Y),order_by(asc(X)), return(list(X-Y))], member(X-Y, [6-z,3-a,2-a,1-a,3-b,4-c,5-d]), L).
   78% L = [1-a, 3-b, 4-c].
   79
   80% To compare performance wrt library(solution_sequences):
   81
   82% whoo:
   83% ?- findall(E, between(1,1000,E),L),findall(E1-E2, limit(10, distinct(E1-E2, order_by([asc(E2)], ( append(_, [E1|T], L), member(E2, T) )))),L2),length(L2,S2).
   84% ERROR: Stack limit (1.0Gb) exceeded
   85
   86% whee:
   87% ?- time((findall(E, between(1,1000,E),L),top_k([limit(10),distinct(E1-E2),order(asc(E2)), return(list(E1-E2))],( append(_, [E1|T], L), member(E2, T) ), L2), length(L2, S2))).
   88% % 21,981,983 inferences, 0.825 CPU in 0.825 seconds (100% CPU, 26650647 Lips)
   89% ...
   90
   91:- meta_predicate top_k(+, 0, -).   92
   93top_k(Options1, Goal, Result) :-
   94    select_option(return(Return), Options1, Options, backtrack),
   95    top_k(Return, Options, Goal, Result).
   96
   97top_k(Return, Options, Goal, Result) :-
   98    (   Options = [_, _|_]
   99    ->  run_optimized(Return, Options, Goal, Result)
  100    ;   dispatch_singles(Return, Options, Goal, Result)
  101    ).
  102
  103dispatch_singles(backtrack,  Opts, Goal, Goal) :- dispatch_singles(Opts, Goal, _).
  104dispatch_singles(list(Term), Opts, Goal, List) :-
  105    option(group_by(Group), Opts, ungrouped),
  106    (   group_by(GK, Term, dispatch_singles(Opts, Goal, GK), List)
  107    *-> Group = GK % instantiate Group
  108    ;   ground(Group),
  109        List = [] % prevent failure if no solutions where found
  110    ).
  111
  112dispatch_singles([],    Goal, ungrouped) :- call(Goal).
  113dispatch_singles([Opt], Goal, GK) :- dispatch_single(Opt, Goal, GK).
  114
  115ordered_term_variables(Term, Vars) :-
  116    term_variables(Term, UVars),
  117    sort(UVars, Vars).
  118
  119dispatch_single(order_by(Spec),  Goal, ungrouped) :- order_by([Spec], Goal).
  120dispatch_single(limit(K),        Goal, ungrouped) :- limit(K, Goal).
  121dispatch_single(distinct(W),     Goal, ungrouped) :- distinct(W, Goal).
  122dispatch_single(group_by(Group), Goal, Group) :- dispatch_group_by(Goal, Group).
  123
  124dispatch_group_by(Goal, Group) :-
  125    ordered_term_variables(Goal, GVars),
  126    ordered_term_variables(Group, KVars),
  127    ord_subtract(GVars, KVars, TVars),
  128    Term =.. [v|TVars],
  129    bagof(Term, Goal, List),
  130    member(Term, List).
  131
  132run_optimized(Return, Opts, Goal, Result) :-
  133    option(limit(Count), Opts, inf),
  134    option(order_by(OrderSpec), Opts, asc(unordered)),
  135    option(group_by(Group), Opts, ungrouped),
  136    (   option(distinct(Witness), Opts)
  137    ->  Distinct = true
  138    ;   Distinct = false
  139    ),
  140    priority_for(OrderSpec, Pri, Key),
  141    run_optimized(Goal, Count, Pri, Key, Distinct, Witness, Group, Return, Result).
  142
  143/* ---------- ordering ---------- */
  144
  145% We store entries with a "priority" such that the heap root is the WORST
  146% among kept ones. Then replacement is cheap.
  147%
  148% For asc(Key): "best" = smallest Key. "worst" = largest Key.
  149% So make heap a max-heap on Key by using priority = Key and using a max-heap?
  150% library(heaps) is a min-heap, so emulate max-heap by negating an order key
  151% or by wrapping with a reversed term order.
  152%
  153% Easiest: map Key to Priority so that worse == smaller Priority (min-heap root).
  154% For asc: worse = larger Key, so Priority = key_rank(Key) where larger Key => smaller Priority
  155% We can use Priority = rev(Key) where rev/1 compares reverse via standard order.
  156
  157priority_for(asc(Key),  @=<, Key).
  158priority_for(desc(Key), @>=, Key).
  159
  160% Because root is WORST, a candidate is "better" if its priority is GREATER
  161% than worst priority (min-heap root means smallest = worst).
  162better_than(@>=, P1, P2) :- P1 @> P2.
  163better_than(@=<, P1, P2) :- P1 @< P2.
  164
  165setup_state(false, none).
  166setup_state(true,  state(DictHolder)) :-
  167    empty_assoc(D0),
  168    DictHolder = holder(D0).
  169
  170seen_hash(state(holder(D)), Hash, Key) :-
  171    get_assoc(Hash, D, Key).
  172
  173mark_hash(state(DictHolder), Hash, Key) :-
  174    DictHolder = holder(D0),
  175    put_assoc(Hash, D0, Key, D1),
  176    nb_setarg(1, DictHolder, D1),
  177    true.
  178
  179update_topk(Count, Pri, Key, Entry, HHolder) :-
  180    HHolder = holder(N0, H0),
  181    (   N0 < Count
  182    ->  add_to_heap(H0, Key, Entry, H1),
  183        N1 is N0 + 1,
  184        nb_setarg(1, HHolder, N1),
  185        nb_setarg(2, HHolder, H1)
  186    ;   % Heap full: compare with worst (root because root is "worst")
  187        replace_topk(_WorstKey, Pri, Key, Entry, HHolder)
  188    ).
  189
  190revdel_from_heap(Q0,Px,X,Q) :-
  191    get_from_heap(Q0,Py,Y,Q1),
  192    revdel_from_heap(Q1,Px,X,Q2),
  193    add_to_heap(Q2,Py,Y,Q),
  194    !.
  195revdel_from_heap(Q0,P,X,Q) :-
  196    get_from_heap(Q0,P,X,Q).
  197
  198pri_del_from_heap(Pri, H0, Key, Entry, HRest) :-
  199    (   var(Key),
  200        Pri == (@=<)
  201    ->  revdel_from_heap(H0, Key, Entry, HRest)
  202    ;   delete_from_heap(H0, Key, Entry, HRest)
  203    ).
  204
  205replace_topk(Key1, Pri, Key, Entry, HHolder) :-
  206    HHolder = holder(_, H0),
  207    pri_del_from_heap(Pri, H0, Key1, _, HRest),
  208    (   better_than(Pri, Key, Key1)
  209    ->  add_to_heap(HRest, Key, Entry, H1),
  210        nb_setarg(2, HHolder, H1)
  211    ;   fail
  212    ).
  213
  214heap_to_list(holder(_N, H), Pri, SortedKeyVars) :-
  215    heap_to_list(H, KV0),
  216    ( Pri == (@=<)
  217    ->% To avoid the reverse we need a max-heap, SWI-Prolog only provides a
  218      % min-heap implementation
  219      reverse(KV0, KV1)
  220    ; KV1 = KV0
  221    ),
  222    sort(1, Pri, KV1, SortedKeyVars).
  223
  224/* ---------- optimized execution (top-K per group) ---------- */
  225
  226run_optimized(Goal, Count, Pri, Key, Distinct, Witness, Group, Return, Result) :-
  227    term_variables(Goal, Vars),
  228    setup_state(Distinct, State),
  229    empty_assoc(G0),
  230    GHolder = holder(G0),  % maps GroupKey -> holder(N,Heap)
  231    term_variables(Witness, WVars),
  232    WTerm =.. [w|WVars],
  233    ( ground(Group)
  234    ->create_bucket(GHolder, Group, _)
  235    ; true
  236    ),
  237    forall(Goal,
  238           ignore(consider_solution(Count, Pri, Key, Distinct, WTerm, Group, Vars, State, GHolder))),
  239    finalize(Group, Return, Pri, GHolder, Vars, Goal, Result).
  240
  241consider_solution(Count, Pri, Key, Distinct, WTerm, Group, Vars, State, GHolder) :-
  242    Entry = Vars,
  243    ( Distinct == true
  244    ->variant_sha1(WTerm, Hash),
  245      ( seen_hash(State, Hash, Key1)
  246      ->get_or_create_bucket(GHolder, Group, Bucket),
  247        replace_topk(Key1, Pri, Key, Entry, Bucket),
  248        GHolder = holder(G0),
  249        put_assoc(Group, G0, Bucket, G1),
  250        nb_setarg(1, GHolder, G1),
  251        mark_hash(State, Hash, Key),
  252        fail
  253      ; true
  254      )
  255    ; true
  256    ),
  257    get_or_create_bucket(GHolder, Group, Bucket),
  258    update_topk(Count, Pri, Key, Entry, Bucket),
  259    GHolder = holder(G0),
  260    put_assoc(Group, G0, Bucket, G1),
  261    nb_setarg(1, GHolder, G1),
  262    ( Distinct == true
  263    ->mark_hash(State, Hash, Key)
  264    ; true
  265    ).
  266
  267get_or_create_bucket(GHolder, Group, Bucket) :-
  268    GHolder = holder(G0),
  269    (   get_assoc(Group, G0, Bucket)
  270    ->  true
  271    ;   create_bucket(GHolder, Group, Bucket)
  272    ).
  273
  274create_bucket(GHolder, Group, Bucket) :-
  275    GHolder = holder(G0),
  276    empty_heap(H0),
  277    Bucket = holder(0, H0),
  278    put_assoc(Group, G0, Bucket, G1),
  279    nb_setarg(1, GHolder, G1).
  280
  281finalize(Group, Return, Pri, holder(G), Vars, Goal, Result) :-
  282    gen_assoc(Group, G, Bucket),
  283    (Group == ungrouped -> ! ; true), % Minor optimization
  284    heap_to_list(Bucket, Pri, List),
  285    emit_result(Return, Vars, List, Goal, Result).
  286
  287emit_result(list(Term), Vars, List, _, Result) :- findall(Term, member(_Key-Vars, List), Result).
  288emit_result(backtrack, Vars, List, Goal, Goal) :- member(_Key-Vars, List)