#!/usr/bin/env perl
#
#  The MIT License
#
#  Copyright (c) 2024-2025 Genome Research Ltd.
#
#  Author: petr.danecek@sanger
#
#  Permission is hereby granted, free of charge, to any person obtaining a copy
#  of this software and associated documentation files (the "Software"), to deal
#  in the Software without restriction, including without limitation the rights
#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#  copies of the Software, and to permit persons to whom the Software is
#  furnished to do so, subject to the following conditions:
#
#  The above copyright notice and this permission notice shall be included in
#  all copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
#  THE SOFTWARE.


use strict;
use warnings;
use Carp;

my $opts = parse_params();
parse_and_calc($opts);

exit;

#--------------------------------

sub error
{
    my (@msg) = @_;
    if ( scalar @msg ) { confess @msg; }
    print
        "About: Parse bcftools/vrfs output and calculate variances from a subset of automatically selected\n",
        "       reference sites\n",
        "Usage: zcat scores.txt.gz | vrfs-variances [OPTIONS]\n",
        "Options:\n",
        "   -n, --ndat NUM                  Number of sites to include, fraction (FLOAT) or absolute (INT) [0.2]\n",
        "   -r, --rand-noise SEED[,RATE]    Add random noise, 0 for random seed [0,1e-3]\n",
        "   -s, --list-sites                List sites passing the -n setting\n",
        "   -S, --sort-func FUNC            Reference site selection is based on the ordering defined by FUNC [nalt]\n",
        "                                       nalt .. sort by the overall number of alternate reads\n",
        "                                       vaf  .. sort by the big-VAF bins being most significant first\n",
        "   -v, --list-var2                 Output in a format suitable for `bcftools +vrfs -r file`\n",
        "   -h, -?, --help                  This help message\n",
        "\n";
    exit -1;
}
sub parse_params
{
    my $opts =
    {
        ndat => 0.2,
        sort_func => \&cmp_dist_nalt,
    };
    if ( -t STDIN && !@ARGV ) { error(); }
    while (defined(my $arg=shift(@ARGV)))
    {
        if ( $arg eq '-r' or $arg eq '--rand-noise' )
        {
            my ($seed,$rate) = split(/,/,shift(@ARGV));
            $$opts{rand_seed} = $seed;
            $$opts{rand_rate} = defined $rate ? $rate : 1e-3;
            next;
        }
        if ( $arg eq '-s' or $arg eq '--list-sites' ) { $$opts{list_sites}=1; next }
        if ( $arg eq '-S' or $arg eq '--sort-func' )
        {
            my $func = shift(@ARGV);
            if ( $func eq 'nalt' ) { $$opts{sort_func} = \&cmp_dist_nalt; }
            elsif ( $func eq 'vaf' ) { $$opts{sort_func} = \&cmp_dist_max_vaf; }
            else { error("Error: the sort function \"$func\" is not supported\n"); }
            next;
        }
        if ( $arg eq '-v' or $arg eq '--list-var2' ) { $$opts{list_var2}=1; next }
        if ( $arg eq '-n' or $arg eq '--ndat' ) { $$opts{ndat}=shift(@ARGV); next }
        if ( $arg eq '-?' or $arg eq '-h' or $arg eq '--help' ) { error(); }
        error("Unknown parameter \"$arg\". Run -h for help.\n");
    }
    if ( exists($$opts{rand_seed}) )
    {
        if ( $$opts{rand_seed} ) { srand($$opts{rand_seed}); }
        else { srand(); }
    }
    return $opts;
}

sub cmp_dist_max_vaf
{
    for (my $i=@{$$a{dist}}-1; $i>=0; $i--)
    {
        if ( $$a{dist}[$i] == $$b{dist}[$i] ) { next; }
        return $$a{dist}[$i] <=> $$b{dist}[$i];
    }
    return 0;
}

sub cmp_dist_nalt
{
    my ($sa,$sb,$na,$nb);
    for (my $i=0; $i<@{$$a{dist}}; $i++)
    {
        # sa,sb .. normalize to the same number of samples the site had data for
        $sa += $$a{dist}[$i];
        $sb += $$b{dist}[$i];

        # na,nb .. the number of alternate reads across all samples
        $na += $$a{dist}[$i]*$i;
        $nb += $$b{dist}[$i]*$i;
    }
    $na /= $sa;
    $nb /= $sb;
    return $na<=>$nb;
}

sub parse_and_calc
{
    my ($opts) = @_;
    my $sort_func = $$opts{sort_func};
    my @dat = ();
    while (my $line=<STDIN>)
    {
        # SITE    chr15   79031596    AAG A   5.746144e+01    926-181-22-12-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0
        if ( !($line=~/^SITE/) ) { next; }
        chomp($line);
        my (@col) = split(/\t/,$line);
        my @dist = split(/-/,$col[-1]);
        push @dat, { line=>$line, dist=>\@dist };
    }
    my @sdat = sort $sort_func @dat;
    my $ndat = $$opts{ndat};
    if ( $ndat <= 1 ) { $ndat = int($ndat * scalar @sdat); }
    my $n = 0;
    my @avg  = ();
    my @avg2 = ();
    for my $x (@sdat)
    {
        my $max = 0;
        for (my $i=0; $i<@{$$x{dist}}; $i++)
        {
            # Add random noise in a very simplistic way: optionally increment one or more VAF bins
            if ( $$opts{rand_seed} && rand(1./$$opts{rand_rate})<=1 ) { $$x{dist}[$i]++; }
            if ( $max < $$x{dist}[$i] ) { $max = $$x{dist}[$i]; }
        }
        for (my $i=0; $i<@{$$x{dist}}; $i++)
        {
            my $val = $$x{dist}[$i] / $max;
            $avg[$i]  += $val;
            $avg2[$i] += $val * $val;
        }
        if ( $$opts{list_sites} ) { print $$x{line}."\n"; }
        if ( ++$n >= $ndat ) { last; }
    }
    if ( $$opts{list_sites} ) { return; }
    $avg2[0] = 1;
    for (my $i=0; $i<@avg; $i++)
    {
        $avg[$i]  = sprintf("%e",$avg[$i]/$n);
        $avg2[$i] = $avg2[$i]/$n - $avg[$i]*$avg[$i];
        if ( $avg2[$i]<=0 )
        {
            # yes, it can be smaller than zero as well, machine precision is in play when the values are close to zero
            $avg2[$i] = $i>0 ? $avg2[$i-1]/2 : 1;
        }
        if ( !exists($$opts{rand_noise}) && $avg2[$i] < 1e-9 )
        {
            $avg2[$i] = $avg2[$i-1] * ($i-1) / $i;
        }
        $avg2[$i] = sprintf("%e",$avg2[$i]);
    }
    # make it monotonic
    for (my $i=@avg2-1; $i>0; $i--)
    {
        if ( $avg2[$i-1] < $avg2[$i] ) { $avg2[$i-1] = $avg2[$i]; }
    }
    if ( $$opts{list_var2} )
    {
        print join("\n",@avg2)."\n";
    }
    else
    {
        print STDERR "MEAN\t".join(" ",@avg)."\n";
        print STDERR "VAR2\t".join(" ",@avg2)."\n";
    }
}

