:- lib(stoics_lib:kv_decompose/3).

bio_diffex_defaults( Args, Defs ) :-
    % this makes EvLog default: true
    ( (memberchk(exp_ev_log(EvLog),Args),EvLog==false) ->
        EvDefs = [  as_non(pvalue),
                    exp_ev_cnm('experiment'),
                    exp_ev_cut_let(-1),
                    exp_ev_cut_get(2),
                    exp_ev_include_inf(false),
                    exp_ev_log(false)
            ]
        ; 
        EvDefs = [  as_non(all),
                    exp_ev_cnm('log2FC'),
                    exp_ev_cut_let(-1),
                    exp_ev_cut_get(1),
                    exp_ev_include_inf(true),
                    exp_ev_log(true)
            ]
    ),
    Defs = [   
                as_pairs(true),
                de_max(false),
                de_mtx(_),
                mtx_dx_in(_),
                exp_pv_cnm('adj.pvalue'),
                exp_pv_cut(0.05),
                gene_id_cnm('Symbols'),
                which(dx(_,_))
                | EvDefs
    ].

/** bio_diffex( +Exp, -DEs, -NonDEs, +Opts ).

Select a sublist of significantly, differentially expressed genes in an experiment.

Exp is in the form of a mtx/1 matrix. Selected (DEs) and rejected (NonDEs) are returned in common format, 
of either (default) Gene-Ev, where Ev is the expression value for Gene in Exp, or sub-matrices of Exp.
The format is controlled by option =|as_pairs|=. 

We use ev as short for expression value, to avoid confusion with _exp_ which short for experiment.

Opts
  * as_non(AsNon=pvalue)
    what to return as non differentially expressed: either everything in Gcnm (AsNon=all, default when EvLog is true), or only those with numeric pvalue (AsNon=pvalue, default when EvLog is false)
  * as_pairs(AsPairs=true)
    whether to return pairs or matrices
  * de_max(DexMax=false)
    puts a cap on the number of differentially expressed genes returned
  * de_mtx(DexMtx)
    returns the selected matrix. if an atom, its taken to be a filename
  * exp_pv_cnm(ExpPcnm='adj.pvalue')
    the experimental column (found in MsF) on which Pcut is applied as a filter
  * exp_pv_cut(Pcut=0.05)
    p.value cut off for experimental input from MSstats
  * exp_ev_log(EvLog=true)
    are the expression values log values 
  * exp_ev_cnm(EvCnm)
    expession value column name (_log2FC_ if EvLog=true, _expression_ otherwise)
  * exp_ev_cut_let(EvLet= -1)
    expression values below (less or equal) which values are selected
  * exp_ev_cut_get(EvGet)
    expression values above (greater or equal) which values are selected (defaults is 2 if EvLog is false, and 1 otherwise)
  * exp_ev_include_inf(IncInf=false)
    include infinity values as diffexs ? (default is _false_ if EvLog is false, and _true_ otherwise)
  * gene_id_cnm(Gcnm='Symbols')
    column name for the key value in the pair lists: DEs and NonDEs.
  * mtx_dx_in(Mtx)
    can be used to return the Mtx used (from mtx(Exp,Mtx) call)
  * which(Wch=dx(UpIs,DownIs))
    returns the up and down-regulated indices

==
?- lib(mtx),
   absolute_file_name(pack('bio_analytics/data/silac/bt.csv'), CsvF),
   mtx(CsvF, Mtx, convert(true)),
   assert(mtx_data(Mtx)).

CsvF = '/usr/local/users/nicos/local/git/lib/swipl/pack/bio_analytics/data/silac/bt.csv',
Mtx = [row('Protein IDs', 'Symbols', log2FC, adj.pvalue), row('Q9P126', ...), row(..., ..., ..., ...)|...].

?- lib(debug_call),
   debug(testo),
   mtx_data(Mtx),
   debug_call(testo, dims, mtx/Mtx),
   bio_diffex(Mtx, DEPrs, NonDEPrs, []),
   debug_call(testo, length, de/DEPrs),
   debug_call(testo, length, nde/NonDEPrs).

% Dimensions for matrix,  (mtx) nR: 1245, nC: 4.
% Length for list, de: 426.
% Length for list, nde: 818.

DEPrs = ['CLEC1B'- -4.80614893171469, 'LGALS1'- -2.065877096524, ... - ...|...],
NonDEPrs = ['CNN2'- -0.69348026828097, 'CXCR4'-0.73039667221395, ... - ...|...].

?- 
    mtx_data(Mtx),
    bio_diffex(Mtx, DEPrs, NonDEPrs, exp_pv_cut(0.01)),
    debug_call(testo, length, de/DEPrs),
    debug_call(testo, length, nde/NonDEPrs).

% Length for list, de: 286.
% Length for list, nde: 958.

?-
    mtx_data(Mtx),
    Opts = [exp_ev_cut_let(inf),exp_ev_cut_get(-inf)],
    bio_diffex(Mtx, DEPrs, NonDEPrs, Opts),
    debug_call(testo, length, de/DEPrs),
    debug_call(testo, length, nde/NonDEPrs).

% Length for list, de: 581.
% Length for list, nde: 663.

?- mtx_data(Mtx),
    Opts = [exp_ev_cut_let(inf),exp_ev_cut_get(-inf),as_pairs(false)],
    bio_diffex(Mtx, DEs, NonDEs, Opts),
    debug_call(testo, length, de/DEPrs),
    debug_call(testo, length, nde/NonDEPrs).

% Length for list, de: 582.
% Length for list, nde: 664.

Mtx = [row('Protein IDs', 'Symbols', log2FC, adj.pvalue), row(..., ..., ..., ...)|...],
Opts = [exp_ev_cut_let(inf), exp_ev_cut_get(-inf), as_pairs(false)],
DEs = [row('Protein IDs', 'Symbols', log2FC, adj.pvalue), row('Q9P126', 'CLEC1B', -4.80614893171469, 2.057e-37), row(..., ..., ..., ...)|...],
NonDEs = [row('Protein IDs', 'Symbols', log2FC, adj.pvalue), row('B4DUT8;Q6FHC3;Q6FHE4;Q99439;B4DDF4;Q53GK7;B4DN57;B4DHU5;K7ES69;H3BVI6;H3BQH0', 'CNN2', -0.69348026828097, 0.0814675), row(..., ..., ..., ...)|...].

==

@author nicos angelopoulos
@version  0.1 2019/5/2
@version  0.2 2020/9/3,  ability to return sub-matrices (9/14): diffex_mtx()
@version  0.3 2022/12/16, changed name from exp_diffex/4 to bio_diffex/4.

*/

bio_diffex( MtxIn, DEs, NonDEs, Args ) :-
    Self = bio_diffex,
    options_append( Self, Args, Opts ),
    options( as_non(AsNon), Opts ),
    options( as_pairs(AsPrs), Opts ),
    options( exp_pv_cnm(ExpPCnm), Opts ),
    options( exp_pv_cut(Pcof), Opts ),
    options( exp_ev_cnm(ExpEvCnm), Opts ),
    options( exp_ev_cut_let(EvLet), Opts ),
    options( exp_ev_cut_get(EvGet), Opts ),
    options( exp_ev_include_inf(InfInc), Opts ),
    options( gene_id_cnm(Gcnm), Opts ),
    options( de_max(DEMaxPrv), Opts ),
    options( de_mtx(DEMtx), Opts ),
    mtx( MtxIn, Mtx, convert(true) ),
    Mtx = [Hdr|Rows],
    Cids = [ExpPCnm,ExpEvCnm,Gcnm],
    CPos = [PvPos,EvPos,GnPos],
    maplist( mtx_header_column_name_pos(Hdr), Cids, _Cnms, CPos ),
    options( de_max(DEMaxPrv), Opts ),
    ( number(DEMaxPrv) -> DEMaxPrv = DEMax; length(Rows,RsLen), DEMax is (RsLen + 2) * 2 ),  % ideally we want infinity ...
    bio_diffex_separate( Rows, DEMax, 1, PvPos, EvPos, GnPos, Pcof, EvLet, EvGet, InfInc, AsNon, AsPrs, TDEs, TNonDEs, Iu, Id, DERows ),
    options( which(dx(Iu,Id)), Opts ),
    exp_diff_add_header( AsPrs, Hdr, TDEs, TNonDEs, DEs, NonDEs ),
    mtx( DEMtx, [Hdr|DERows] ),
    options( mtx_dx_in(Mtx), Opts ).

bio_diffex_separate( [], _X, _I, _PvPos, _EvPos, _GnPos, _Pcof, _EvLet, _EvGet, _InfInc, _AsNon, _AsPrs, [], [], [], [], [] ).
bio_diffex_separate( [Row|Rows], X, I, PvPos, EvPos, GnPos, Pcof, EvLet, EvGet, InfInc, AsNon, AsPrs, DEs, NonDEs, Iu, Id, DERows ) :-
    arg( PvPos, Row, Pv ), 
    arg( EvPos, Row, Ev ),
    arg( GnPos, Row, Gn ),
    ( (X>0,bio_diffex_select(Pv,Ev,Pcof,EvLet,EvGet,Dir,InfInc)) ->
        ( Dir == up -> Iu = [I|IuT], IdT = Id; IuT = Iu, Id = [I|IdT] ),
        bio_diffex_ret_elem( AsPrs, Row, Gn, Ev, DEs, TDEs ),
        Y is X - 1,
        NonDEs = TNonDEs,
        DERows = [Row|TDERows]
        ;
        IuT = Iu,
        IdT = Id,
        Y is X,
        DEs = TDEs,
        ( (AsNon == pvalue, \+ number(Pv)) -> NonDEs = TNonDEs; bio_diffex_ret_elem(AsPrs,Row,Gn,Ev,NonDEs,TNonDEs)
                % NonDEs = [Gn-Ev|TNonDEs] 
        ),
        DERows = TDERows
    ),
    J is I + 1,
    bio_diffex_separate( Rows, Y, J, PvPos, EvPos, GnPos, Pcof, EvLet, EvGet, InfInc, AsNon, AsPrs, TDEs, TNonDEs, IuT, IdT, TDERows ).

exp_diff_add_header( false, Hdr, TDEs, TNonDEs, DEs, NonDEs ) :-
    !,
    DEs = [Hdr|TDEs],
    NonDEs = [Hdr|TNonDEs].
exp_diff_add_header( _Defaulty, _Hdr, TDEs, TNonDEs, DEs, NonDEs ) :-
    DEs = TDEs,
    NonDEs = TNonDEs.

bio_diffex_ret_elem( false, Row, _Gn, _Ev, List, Tail ) :-
    !,
    List = [Row|Tail].
bio_diffex_ret_elem( _Defaulty, _Row, Gn, Ev, List, Tail ) :-
    % DEs = [Gn-Ev|TDEs],
    List = [Gn-Ev|Tail].

bio_diffex_select( Pv, Ev, Pcof, EvLet, EvGet, Dir, InfInc ) :-
    number( Pv ), 
    Pv < Pcof,
    number(Ev),
    ( Ev =< EvLet ->
                    Dir = down
                    ; 
                    EvGet =< Ev,
                    Dir = up
    ),
    bio_diffex_if_inf_include( Ev, InfInc ).

bio_diffex_if_inf_include( Val, Incl ) :-
    ( Val is inf; Val is - inf ),
    !,
    Incl == true.
bio_diffex_if_inf_include( _Val, _Incl ).