:- use_module(library(apply)).
:- use_module(library(lists)).
:- use_module(library(debug)).

:- use_module(library(lib)).

:- lib(bio_db).
:- lib(bio_db_repo).
:- lib(by_unix).

:- lib(debug).
:- debug(bio_db_stats).
% :- debug(bio_db).

/**  bio_db_stats.
     bio_db_stats(Opts).

Writes out a number of measures for the datasets served by bio_db.
You should have installed bio_db_repo, if you want to run this.

In the interest of limiting required memory, the pred assumes you
are using the Prolog interface of bio_db (see bio_db_interface/2),
this predicate abolishes the interrogated bio_db data predicates 
once it has counted its contents.  

Opts
  * abolish(Abl=true)
    use false to turn abolish off. This is no longer needed (v0.3)
  * csv(Csv=false)
    requires pack(mtx),
    use _true_ to save as csv to bio_db_stats.csv, _version_ for depositing
    a bio_db_repo installed versioned csv in bio_db_scirpts/ directory,
    or give atom of file name to use

As of bio_db_version(3:6:0,_) the directory scripts/bio_db_stats/ holds the csv output of this script for each 
new pack(bio_db_repo) version, produced with option csv(version).

As of version 0.3 you need pack(by_unix)- this avoids over-loading the memory.

==
?- bio_db_stats.
% cgnc_galg_cgnc_curs/2 has 25925 records.
% cgnc_galg_cgnc_edat/2 has 25925 records.
% cgnc_galg_cgnc_ensg/2 has 15253 records.
% cgnc_galg_cgnc_name/2 has 25925 records.
% cgnc_galg_cgnc_ncbi/2 has 25921 records.
% cgnc_galg_cgnc_symb/2 has 25925 records.
% cgnc_galg_cgnc_syno/2 has 3030 records.
% ense_galg_ensg_chrl/5 has 30108 records.
% ense_galg_ensg_symb/2 has 14249 records.
% ense_galg_enst_chrl/5 has 72689 records.
% ense_galg_enst_ensg/2 has 72689 records.
% ense_gg6a_ensg_chrl/5 has 30862 records.
% ense_gg6a_ensg_symb/2 has 6480 records.
% ense_gg6a_enst_chrl/5 has 74296 records.
% ense_gg6a_enst_ensg/2 has 74296 records.
% ense_homs_ensg_chrl/5 has 62710 records.
% ense_homs_ensg_hgnc/2 has 41287 records.
% ense_homs_ensg_symb/2 has 41287 records.
% ense_homs_enst_chrl/5 has 252702 records.
% ense_homs_enst_ensg/2 has 252702 records.
% ense_musm_ensg_chrl/5 has 57010 records.
% ense_musm_ensg_mgim/2 has 55398 records.
% ense_musm_ensg_symb/2 has 57010 records.
% ense_musm_enst_chrl/5 has 149347 records.
% ense_musm_enst_ensg/2 has 149347 records.
% ense_suss_ensg_chrl/5 has 35670 records.
% ense_suss_ensg_symb/2 has 17511 records.
% ense_suss_enst_chrl/5 has 60273 records.
% ense_suss_enst_ensg/2 has 60273 records.
% gont_galg_symb_gont/4 has 117139 records.
% gont_homs_edge_gisa/2 has 68650 records.
% gont_homs_edge_gnrg/2 has 2951 records.
% gont_homs_edge_gpof/2 has 7696 records.
% gont_homs_edge_gprg/2 has 2945 records.
% gont_homs_edge_greg/2 has 3396 records.
% gont_homs_gont_gonm/2 has 42950 records.
% gont_homs_gont_symb/3 has 336023 records.
% gont_homs_symb_gont/3 has 336023 records.
% gont_musm_gont_symb/4 has 437854 records.
% gont_musm_mgim_gont/4 has 437865 records.
% gont_suss_symb_gont/4 has 125832 records.
% hgnc_homs_ccds_hgnc/2 has 18926 records.
% hgnc_homs_ensg_hgnc/2 has 41001 records.
% hgnc_homs_hgnc_ccds/2 has 18926 records.
% hgnc_homs_hgnc_chrb/2 has 43699 records.
% hgnc_homs_hgnc_ensg/2 has 41001 records.
% hgnc_homs_hgnc_name/2 has 43700 records.
% hgnc_homs_hgnc_ncbi/2 has 43654 records.
% hgnc_homs_hgnc_symb/2 has 43700 records.
% hgnc_homs_ncbi_hgnc/2 has 43654 records.
% hgnc_homs_ncbi_symb/2 has 43654 records.
% hgnc_homs_prev_symb/2 has 15516 records.
% hgnc_homs_symb_hgnc/2 has 43700 records.
% hgnc_homs_symb_ncbi/2 has 43654 records.
% hgnc_homs_syno_symb/2 has 43719 records.
% mgim_musm_mgim_chrl/5 has 676449 records.
% mgim_musm_mgim_genb/2 has 277243 records.
% mgim_musm_mgim_ncbi/2 has 719140 records.
% mgim_musm_mgim_symb/2 has 676449 records.
% mgim_musm_mgim_unip/2 has 16988 records.
% mgim_musm_symb_wdra/2 has 59005 records.
% mgim_musm_syno_mgim/2 has 257027 records.
% ncbi_homs_dnuc_symb/2 has 869087 records.
% ncbi_homs_ensg_ncbi/2 has 36278 records.
% ncbi_homs_ensp_ncbi/2 has 46300 records.
% ncbi_homs_ncbi_ensg/2 has 36278 records.
% ncbi_homs_ncbi_ensp/2 has 46300 records.
% ncbi_homs_rnuc_symb/2 has 466669 records.
% ncbi_musm_syno_symb/2 has 70348 records.
% ncbi_suss_ensg_ncbi/2 has 17782 records.
% ncbi_suss_ensp_ncbi/2 has 23600 records.
% ncbi_suss_ncbi_ensg/2 has 17782 records.
% ncbi_suss_ncbi_ensp/2 has 23600 records.
% pros_homs_pros_prsn/2 has 1851 records.
% pros_homs_pros_sprt/7 has 54556 records.
% strg_galg_edge_ensp/3 has 7821418 records.
% strg_galg_edge_symb/3 has 3910709 records.
% strg_galg_ensp_symb/2 has 15508 records.
% strg_homs_edge_ensp/3 has 11938498 records.
% strg_homs_edge_symb/3 has 5066306 records.
% strg_musm_edge_ensp/3 has 14496358 records.
% strg_musm_edge_symb/3 has 6258522 records.
% strg_suss_edge_ensp/3 has 13781164 records.
% strg_suss_edge_symb/3 has 6890582 records.
% strg_suss_ensp_symb/2 has 21597 records.
% unip_galg_unip_ensp/2 has 45911 records.
% unip_galg_unip_gyno/2 has 1534 records.
% unip_galg_unip_ncbi/2 has 9055 records.
% unip_galg_unip_strp/2 has 3054 records.
% unip_galg_unip_symb/2 has 45484 records.
% unip_homs_ensp_unip/2 has 120717 records.
% unip_homs_hgnc_unip/2 has 81725 records.
% unip_homs_sprt_seqn/2 has 20422 records.
% unip_homs_trem_nucs/2 has 1038884 records.
% unip_homs_trem_seqn/2 has 187358 records.
% unip_homs_unip_hgnc/2 has 81725 records.
% unip_homs_unip_ncbi/2 has 33144 records.
% unip_musm_ensp_unip/2 has 67162 records.
% unip_musm_gyno_unip/2 has 28951 records.
% unip_musm_mgim_unip/2 has 80434 records.
% unip_musm_trem_nucs/2 has 129691 records.
% unip_musm_unip_ncbi/2 has 32442 records.
% unip_musm_unip_symb/2 has 83163 records.
% Total number of predicates: 103, and records: 80948178

% You should better halt this session as bio_db predicates were retracted.
true.

?- bio_db_stats( [abolish(false),csv('/tmp/bio_db_stats.csv')] ).

% bio_db version: 3:4:0, date: date(2021,5,10)
% bio_db_repo version: 21:12:3, date: date(2021,12,3)
% 
% edge_gont_is_a/2 has 70755 records.
....

?- shell('head -2 /tmp/bio_db_stats.csv').
edge_gont_is_a,2,70755
edge_gont_negatively_regulates,2,3089
...
==

@author nicos angelopoulos
@version  0.1 2018/11/23
@version  0.2 2021/12/03,  options, sort predicates
@version  0.3 2023/06/21,  use pack(by_unix) to use external call-avoid memory problems

*/
bio_db_stats :-
    bio_db_stats( [] ).

bio_db_stats( Args ) :-
    Defs = [csv(false),abolish(true)],
    ( is_list(Args) -> append(Args,Defs,Opts)
                     ; append([Args],Defs,Opts) ),
    bio_db_version( Vers, Date ),
    debug( bio_db_stats, 'bio_db version: ~w, date: ~w', [Vers,Date] ),
    bio_db_repo_version( RpVers, RpDate ),
    debug( bio_db_stats, 'bio_db_repo version: ~w, date: ~w\n', [RpVers,RpDate] ),
    memberchk( abolish(Abl), Opts ),
    bio_db_stats_trips( Abl, Trips, NoPreds, NoRecs ),
    memberchk( csv(Csv), Opts ),
    bio_db_stats_csv( Csv, Trips, NoPreds, NoRecs ),
    ( Abl == true ->
        debug( bio_db_stats, 'You should better halt this session as bio_db predicates were retracted.', [] )
        ;
        true
    ).

bio_db_stats_csv( false, _Trips, _NoPreds, _NoRecs ) :-
    !.
bio_db_stats_csv( true, Trips, NoPreds, NoRecs ) :-
    !,
    bio_db_stats_csv( 'bio_db_stats.csv', Trips, NoPreds, NoRecs ).
bio_db_stats_csv( version, Trips, NoPreds, NoRecs ) :-
    !,
    bio_db_repo_version( RpVers, _RpDate ),
    RpVers = Mj:Mn:Fx,
    atomic_list_concat( ['bio_db_stats/bio_db_stats-',Mj,'.',Mn,'.',Fx,'.csv'], '', CsvF ),
    bio_db_stats_csv( CsvF, Trips, NoPreds, NoRecs ).
bio_db_stats_csv( CsvF, Trips, NoPreds, NoRecs ) :-
    lib( mtx ),
    findall( row(Pn,Pa,Len), member(Pn/Pa-Len,Trips), Rows ),
    debug( bio_db_stats, 'Writing to csv output to file: ~p', [CsvF] ),
    bio_db_version( Vers, Date ),
    Vers = Mj:Mn:Fx,
    Date = date(Yr,Mo,Da),
    atomic_list_concat( [Mj,Mn,Fx], ':', BioDbVersTkn ),
    atomic_list_concat( [Yr,Mo,Da], '.', BioDbDateTkn ),
    bio_db_repo_version( RVers, RDate ),
    RVers = RMj:RMn:RFx,
    RDate = date(RYr,RMo,RDa),
    atomic_list_concat( [RMj,RMn,RFx], ':', RBioDbVersTkn ),
    atomic_list_concat( [RYr,RMo,RDa], '.', RBioDbDateTkn ),
    Hdr1 = row(bio_db_version,BioDbVersTkn,BioDbDateTkn ),
    Hdr2 = row(bio_db_repo_version,RBioDbVersTkn,RBioDbDateTkn),
    Hdr3 = row(total_tables_records,NoPreds,NoRecs),
    mtx( CsvF, [Hdr1,Hdr2,Hdr3|Rows] ).

bio_db_stats_trips( Abl, Trips, NoPreds, NoRecs ) :-
    findall( Pn/Pa, bio_db_data_predicate(Pn,Pa), PnPas ),
    sort( PnPas, OrdPnPas ),
    % debuc( bio_db_stats, enum, ord_pn_pas/OrdPnPas ),
    bio_db_preds_stats( OrdPnPas, Abl, Trips ),
    findall( Len, member(_-Len,Trips), Lens ),
    sum_list( Lens, NoRecs ),
    % findall( Pn/Pa-Len, bio_db_pred_stats(Pn,Pa,Abl,Len), Trips ),
    length( Trips, NoPreds ),
    debug( bio_db_stats, 'Total number of predicates: ~d, and records: ~d\n', [NoPreds,NoRecs] ).

bio_db_data_predicate( Pn, Pa ) :-
    current_predicate( bio_db:Pn/Pa ),
    % member( Pn/Pa, OrdPnPas ),
    % member( Pfx, [map_,edge_] ),
    % atom_concat( Pfx, _, Pn ),
    atomic_list_concat( [Db,Org,Obj1,Obj2], '_', Pn ),
    maplist( atom_length, [Db,Org,Obj1,Obj2], [4,4,4,4] ).


bio_db_preds_stats( [], _Abl, [] ).
bio_db_preds_stats( [Pn/Pa|T], Abl, [Pn/Pa-Len|Lens] ) :-
    % \+ atom_concat( _, info, Pn ),  % currently map_mgim_mouse_syno_mgim_info/2 is tried...
    functor( G, Pn, Pa ),
    % debug( bio_db_stats, 'doing: ~w', [Pn/Pa] ),
    % once( \+ \+ G ),
    % once( predicate_property(bio_db:G,number_of_clauses(Len)) ),
    term_to_atom( G, Gatm ),
    atomic_list_concat( ['findall(1,',Gatm,',Ones),length(Ones,Len),write(Len),nl,halt.'], Findall ),
    Got @@ swipl( -q, -g, 'lib(bio_db)', -g, Findall ),
    last( Got, Last ),
    % Got = [LenAtm],
    atom_number( Last, Len ),
    number( Len ),
    % findall( 1, G, Ones ),
    % length( Ones, Len ),
    debug( bio_db_stats, '~w/~d has ~d records.', [Pn,Pa,Len] ),
    % this is no longer relevant
    % bio_db_stats_abolish( Abl, Pn, Pa ),
    garbage_collect,
    garbage_collect_atoms,
    garbage_collect_clauses,
    trim_stacks,
    trim_heap,
    bio_db_preds_stats( T, Abl, Lens ).

bio_db_stats_abolish( false, _Pn, _Pa ) :-
    !.
bio_db_stats_abolish( true, Pn, Pa ) :-
    % abolish( bio_db:Pn/Pa ).
    retractall( bio_db:Pn/Pa ).