View source with formatted comments or as raw
    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2011-2021, VU University Amsterdam
    7                              SWI-Prolog Solutions b.v.
    8    All rights reserved.
    9
   10    Redistribution and use in source and binary forms, with or without
   11    modification, are permitted provided that the following conditions
   12    are met:
   13
   14    1. Redistributions of source code must retain the above copyright
   15       notice, this list of conditions and the following disclaimer.
   16
   17    2. Redistributions in binary form must reproduce the above copyright
   18       notice, this list of conditions and the following disclaimer in
   19       the documentation and/or other materials provided with the
   20       distribution.
   21
   22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   33    POSSIBILITY OF SUCH DAMAGE.
   34*/
   35
   36:- module(isub,
   37          [ isub/4,              % +Text1, +Text2, -Distance, +Options
   38            '$isub'/5            % +Text1, +Text2, -Distance, +Flags, +Threshold
   39          ]).   40:- autoload(library(option), [option/3]).   41
   42:- use_foreign_library(foreign(isub)).   43
   44/** <module> isub: a string similarity measure
   45
   46The library(isub) implements a similarity measure between strings, i.e.,
   47something similar to the _|Levenshtein distance|_.  This method is based
   48on the length of common substrings.
   49
   50@author Giorgos Stoilos
   51@see    _|A string metric for ontology alignment|_ by Giorgos Stoilos,
   52        2005 - http://www.image.ece.ntua.gr/papers/378.pdf .
   53*/
   54
   55%!  isub(+Text1:text, +Text2:text,
   56%!       -Similarity:float, +Options:list ) is det.
   57%
   58%   Similarity is a measure  of   the  similarity/dissimilarity  between
   59%   Text1 and Text2. E.g.
   60%
   61%     ```
   62%     ?- isub('E56.Language', 'languange', D, [normalize(true)]).
   63%     D = 0.4226950354609929.                       % [-1,1] range
   64%
   65%     ?- isub('E56.Language', 'languange', D, [normalize(true),zero_to_one(true)]).
   66%     D = 0.7113475177304964.                       % [0,1] range
   67%
   68%     ?- isub('E56.Language', 'languange', D, []).  % without normalization
   69%     D = 0.19047619047619047.                      % [-1,1] range
   70%
   71%     ?- isub(aa, aa, D, []).  % does not work for short substrings
   72%     D = -0.8.
   73%
   74%     ?- isub(aa, aa, D, [substring_threshold(0)]). % works with short substrings
   75%     D = 1.0.                                      % but may give unwanted values
   76%                                                   % between e.g. 'store' and 'spore'.
   77%
   78%     ?- isub(joe, hoe, D, [substring_threshold(0)]).
   79%     D = 0.5315315315315314.
   80%
   81%     ?- isub(joe, hoe, D, []).
   82%     D = -1.0.
   83%     ```
   84%
   85%   This is a new version of isub/4 which replaces the old version while
   86%   providing backwards compatibility. This new   version allows several
   87%   options to tweak the algorithm.
   88%
   89%   @arg Text1 and Text2 are either an atom, string or a list of
   90%   characters or character codes.
   91%   @arg Similarity is a float in the range [-1,1.0], where 1.0
   92%   means _|most similar|_. The range can be set to [0,1] with
   93%   the zero_to_one option described below.
   94%   @arg Options is a list with elements described below. Please
   95%   note that the options are processed at compile time using
   96%   goal_expansion to provide much better speed. Supported options
   97%   are:
   98%
   99%   - normalize(+Boolean)
  100%   Applies string normalization as implemented by the original
  101%   authors: Text1  and Text2 are mapped
  102%   to lowercase and the characters  "._   "  are removed. Lowercase
  103%   mapping is done  with  the   C-library  function  towlower(). In
  104%   general, the required normalization is   domain dependent and is
  105%   better left to the caller.  See e.g., unaccent_atom/2. The default
  106%   is to skip normalization (`false`).
  107%
  108%   - zero_to_one(+Boolean)
  109%   The old isub implementation deviated from the original algorithm
  110%   by returning a value in the [0,1] range. This new isub/4 implementation
  111%   defaults to the original range of [-1,1], but this option can be set
  112%   to `true` to set the output range to [0,1].
  113%
  114%   - substring_threshold(+Nonneg)
  115%   The original algorithm was meant to compare terms in semantic web
  116%   ontologies, and it had a hard coded parameter that only considered
  117%   substring similarities greater than 2 characters. This caused the
  118%   similarity between, for example 'aa' and 'aa' to return -0.8 which
  119%   is not expected. This option allows the user to set any threshold,
  120%   such as 0, so that the similarity between short substrings can be
  121%   properly recognized. The default value is 2 which is what the
  122%   original algorithm used.
  123
  % Legacy calling convention isub(T1, T2, Normalize, Similarity): taken
  % only when the third argument is literally the atom `true` or `false`.
  % Both variants use substring threshold 2; `true` passes flag word 0x3
  % and `false` passes 0x1 to '$isub'/5.  The cut commits to the legacy
  % reading, so the option-list clause at the end is not tried afterwards.
  isub(T1, T2, Normalize, Similarity) :-
      Normalize == true,
      !,
      '$isub'(T1, T2, Similarity, 0x3, 2).
  isub(T1, T2, Normalize, Similarity) :-
      Normalize == false,
      !,
      '$isub'(T1, T2, Similarity, 0x1, 2).
  % Current calling convention isub(T1, T2, Similarity, Options): the
  % option list is translated by isub_options/3 into the flag word and
  % the substring threshold that are handed to '$isub'/5.
  isub(T1, T2, Similarity, Options) :-
      isub_options(Flags, Threshold, Options),
      '$isub'(T1, T2, Similarity, Flags, Threshold).
  133
  %!  isub_options(-NumOpts, -SubstringThreshold, +Options)
  %
  %   Translate the option list of isub/4 into the arguments expected by
  %   '$isub'/5.  NumOpts is the bitwise OR of the flags contributed by
  %   the normalize(Boolean) and zero_to_one(Boolean) options (both
  %   default to `false`).  SubstringThreshold is the value of the
  %   substring_threshold option (default 2).  Fails if the value of
  %   normalize or zero_to_one is neither `true` nor `false`.
  isub_options(NumOpts, SubstringThreshold, Options) :-
      option(normalize(Norm), Options, false),
      normalize_int(Norm, NormBit),
      option(zero_to_one(UnitRange), Options, false),
      zero_one_range_int(UnitRange, RangeBit),
      option(substring_threshold(SubstringThreshold), Options, 2),
      NumOpts is NormBit \/ RangeBit.
  141
  %!  normalize_int(?Boolean, ?Flag)
  %
  %   Flag is the integer that the option normalize(Boolean) contributes
  %   to the flag word: bit 1 (value 2) for `true`, zero for `false`.
  normalize_int(true,  0b10).
  normalize_int(false, 0b00).
  144
  %!  zero_one_range_int(?Boolean, ?Flag)
  %
  %   Flag is the integer that the option zero_to_one(Boolean) contributes
  %   to the flag word: bit 0 (value 1) for `true`, zero for `false`.
  zero_one_range_int(true,  0b01).
  zero_one_range_int(false, 0b00).
  147
  % Compile-time rewriting of isub/4 goals into direct calls to '$isub'/5.
  %
  % Legacy form isub(T1, T2, Normalize, Similarity): expanded only when
  % the third argument is literally `true` or `false`.  The flag words
  % mirror the run-time clauses of isub/4 (0x3 for `true`, 0x1 for
  % `false`), both with substring threshold 2.  If the third argument is
  % anything else the condition fails and the next clause is tried.
  user:goal_expansion(isub(T1,T2,Normalize,D),
                      '$isub'(T1,T2,D,NumOpts,SubstringThreshold)) :-
      (   Normalize == true
      ->  NumOpts = 0x3, SubstringThreshold = 2
      ;   Normalize == false
      ->  NumOpts = 0x1, SubstringThreshold = 2
      ).
  % Option-list form isub(T1, T2, Similarity, Options): expanded only when
  % Options is a proper, ground list, i.e. fully known at compile time.
  % Running isub_options/3 on a list that still contains variables would
  % unify the caller's variables with values chosen while compiling (for
  % instance normalize_int/2 binds an unbound normalize value to `true`).
  % Goals that are not expanded are handled by isub/4 at run time.
  user:goal_expansion(isub(T1,T2,D,Options),
                      '$isub'(T1,T2,D,NumOpts,SubstringThreshold)) :-
      is_list(Options),
      ground(Options),
      isub_options(NumOpts,SubstringThreshold, Options).
  158
  :- multifile sandbox:safe_primitive/1.

  % Declare the public predicate isub/4 and the foreign predicate
  % '$isub'/5 of this module as safe primitives for library(sandbox).
  sandbox:safe_primitive(isub:isub(_,_,_,_)).
  sandbox:safe_primitive(isub:'$isub'(_,_,_,_,_)).