The following awk one-liner reads the PHYLIP-formatted square distance matrix file $infile and writes its symmetrized version (each pair of entries d(i,j) and d(j,i) is replaced by their mean) into $outfile ($prec is the number of decimal places):
awk -v p="$prec" '
  # Symmetrize a square PHYLIP distance matrix.
  #   p : number of decimal places used when printing the distances.
  # Line 1 of the input (the taxon count) is skipped. Every later non-blank
  # line is one taxon: label in field 1, distances in the remaining fields.
  NR > 1 && NF {
    lbl[++n] = $1
    if (length($1) > m) m = length($1)     # m = longest label, used for padding
    for (c = 2; c <= NF; c++) d[n, c - 1] = $c
  }
  END {
    # Replace d[i,j] and d[j,i] by their mean for every pair i > j.
    for (i = n; i > 1; i--)
      for (j = i - 1; j > 0; j--)
        d[i, j] = d[j, i] = (d[i, j] + d[j, i]) / 2

    print " " n                            # PHYLIP header: number of taxa
    pad = sprintf("%" m "s", "")           # m blanks
    for (i = 1; i <= n; i++) {
      # The label is passed as an argument and never as the format string,
      # so a "%" inside a taxon name is printed literally.
      printf("%s", substr(lbl[i] pad, 1, m))
      for (j = 1; j <= n; j++) printf(" %." p "f", d[i, j])
      print ""
    }
  }' "$infile" > "$outfile"
The following gawk one-liner estimates the arboricity coefficient (Guénoche and Garreta 2001; [pdf]) from the PHYLIP-formatted distance matrix file $infile (either square or lower-triangular). The arboricity coefficient assesses the overall treelikeness of the evolutionary distances: the closer it is to 1, the stronger the phylogenetic signal carried by the distances.
gawk '
  # Arboricity coefficient of a PHYLIP distance matrix (square or
  # lower-triangular): only the entries below the diagonal are read.
  # Line 1 (the taxon count) and blank lines are skipped.
  NR > 1 && NF {
    ++n
    for (j = 1; j < n; j++) d[n][j] = $(j + 1)
  }
  END {
    # The coefficient is a proportion of quartets, so at least 4 taxa are
    # needed; otherwise the denominator below would be zero.
    if (n < 4) {
      print "arboricity: at least 4 taxa are required" > "/dev/stderr"
      exit 1
    }
    # Visit every quartet u > v > x > y exactly once.
    for (u = n; u > 0; u--)
      for (v = u - 1; v > 0; v--) {
        duv = d[u][v]
        for (x = v - 1; x > 0; x--) {
          dux = d[u][x]
          dvx = d[v][x]
          for (y = x - 1; y > 0; y--) {
            # The three pairwise sums of the quartet.
            a = duv + d[x][y]
            b = dux + d[v][y]
            c = dvx + d[u][y]
            # Median of the three sums.
            med = a > b ? (c > a ? a : (b > c ? b : c)) : (c > b ? b : (a > c ? a : c))
            # Counted when the median is at least the mean of the three
            # sums, i.e. it is no closer to the smallest than to the largest.
            if (a + b + c <= 3 * med) ++arb
          }
        }
      }
    # Divide by the number of quartets, n(n-1)(n-2)(n-3)/24. The factors are
    # written out so the result does not depend on evaluation order.
    print 24 * arb / (n * (n - 1) * (n - 2) * (n - 3))
  }' "$infile"
A δ plot (Holland et al. 2002) displays a measure of the treelikeness of quartets as a histogram with $nb intervals. The following gawk one-liner estimates the δ plot from the PHYLIP-formatted distance matrix file $infile (either square or lower-triangular):
gawk -v nb="$nb" '
  # Delta plot: histogram with nb intervals of the delta value of every
  # quartet of a PHYLIP distance matrix (square or lower-triangular; only the
  # entries below the diagonal are read).
  # Line 1 (the taxon count) and blank lines are skipped.
  NR > 1 && NF {
    ++n
    for (j = 1; j < n; j++) d[n][j] = $(j + 1)
  }
  END {
    nb = int(nb)
    if (nb < 1) {
      print "delta plot: nb must be a positive integer" > "/dev/stderr"
      exit 1
    }
    # Visit every quartet u > v > x > y exactly once.
    for (u = n; u > 0; u--)
      for (v = u - 1; v > 0; v--) {
        duv = d[u][v]
        for (x = v - 1; x > 0; x--) {
          dux = d[u][x]
          dvx = d[v][x]
          for (y = x - 1; y > 0; y--) {
            # The three pairwise sums of the quartet and their min/med/max.
            sa = duv + d[x][y]
            sb = dux + d[v][y]
            sc = dvx + d[u][y]
            min = sa < sb ? (sa < sc ? sa : sc) : (sb < sc ? sb : sc)
            max = sa > sb ? (sa > sc ? sa : sc) : (sb > sc ? sb : sc)
            med = sa > sb ? (sc > sa ? sa : (sb > sc ? sb : sc)) : (sc > sb ? sb : (sa > sc ? sa : sc))
            delta = (max != min) ? (max - med) / (max - min) : 0
            # Interval [bin/nb, (bin+1)/nb[ as printed below. delta = 0 goes
            # to the first interval; delta = 1 is put in the last one.
            bin = 0
            while (bin + 1 < nb && delta >= (bin + 1) / nb) ++bin
            ++q[bin]
          }
        }
      }
    # One line per interval; empty intervals are reported with a count of 0.
    for (b = 0; b < nb; b++)
      printf("[%.3f,%.3f[ %d\n", b / nb, (b + 1) / nb, q[b])
  }' "$infile"
The δx index of every taxon x, i.e. the mean δ value over all quartets containing x (see Holland et al. 2002), can also be estimated and sorted from the most to the least troublesome taxon with the following gawk one-liner:
gawk '
  # Per-taxon delta index: mean delta value over all quartets that contain
  # the taxon, printed in decreasing order. Input is a PHYLIP distance matrix
  # (square or lower-triangular; only entries below the diagonal are read).
  # Line 1 (the taxon count) and blank lines are skipped.
  NR > 1 && NF {
    lbl[++n] = $1
    if (length($1) > m) m = length($1)     # m = longest label, used for padding
    for (j = 1; j < n; j++) d[n][j] = $(j + 1)
  }
  END {
    if (n < 4) {
      print "delta index: at least 4 taxa are required" > "/dev/stderr"
      exit 1
    }
    pad = sprintf("%" m "s", "")           # m blanks
    nq = (n - 1) * (n - 2) * (n - 3) / 6   # number of quartets containing one taxon
    # Visit every quartet u > v > x > y exactly once.
    for (u = n; u > 0; u--)
      for (v = u - 1; v > 0; v--) {
        duv = d[u][v]
        for (x = v - 1; x > 0; x--) {
          dux = d[u][x]
          dvx = d[v][x]
          for (y = x - 1; y > 0; y--) {
            sa = duv + d[x][y]
            sb = dux + d[v][y]
            sc = dvx + d[u][y]
            min = sa < sb ? (sa < sc ? sa : sc) : (sb < sc ? sb : sc)
            max = sa > sb ? (sa > sc ? sa : sc) : (sb > sc ? sb : sc)
            med = sa > sb ? (sc > sa ? sa : (sb > sc ? sb : sc)) : (sc > sb ? sb : (sa > sc ? sa : sc))
            delta = (max != min) ? (max - med) / (max - min) : 0
            q[u] += delta; q[v] += delta; q[x] += delta; q[y] += delta
          }
        }
      }
    # Prefix each output line with its score as a fixed-width sort key.
    # sprintf is used explicitly because integral values (0 or 1) are
    # converted to strings without CONVFMT, which would shorten the key.
    for (i = 1; i <= n; i++) {
      score = sprintf("%.5f", q[i] / nq)   # always 7 characters for values in [0,1]
      lbl[i] = score " " substr(lbl[i] pad, 1, m) " " score
    }
    asort(lbl)
    print substr("#taxon" pad, 1, m) " tti"
    # Largest score first; position 9 skips the 7-character key and its blank.
    for (i = n; i > 0; i--) print substr(lbl[i], 9)
  }' "$infile"
As a complement to the δ plot method, the following gawk one-liner estimates a troublesome taxon index (tti) for each taxon from the PHYLIP-formatted distance matrix file $infile (either square or lower-triangular). Built on the same basis as the arboricity coefficient, the tti of a taxon x is the proportion of the taxon quartets containing x that strongly violate the quadrangular inequality property. Results are sorted from the most to the least troublesome taxa.
gawk '
  # Troublesome taxon index (tti): for each taxon, the proportion of the
  # quartets containing it whose median pairwise sum is below the mean of the
  # three sums. Printed in decreasing order. Input is a PHYLIP distance
  # matrix (square or lower-triangular; only entries below the diagonal are
  # read). Line 1 (the taxon count) and blank lines are skipped.
  NR > 1 && NF {
    lbl[++n] = $1
    if (length($1) > m) m = length($1)     # m = longest label, used for padding
    for (j = 1; j < n; j++) d[n][j] = $(j + 1)
  }
  END {
    if (n < 4) {
      print "tti: at least 4 taxa are required" > "/dev/stderr"
      exit 1
    }
    # The padding lives in its own variable so that the quartet sums below
    # cannot overwrite it.
    pad = sprintf("%" m "s", "")           # m blanks
    nq = (n - 1) * (n - 2) * (n - 3) / 6   # number of quartets containing one taxon
    # Visit every quartet u > v > x > y exactly once.
    for (u = n; u > 0; u--)
      for (v = u - 1; v > 0; v--) {
        duv = d[u][v]
        for (x = v - 1; x > 0; x--) {
          dux = d[u][x]
          dvx = d[v][x]
          for (y = x - 1; y > 0; y--) {
            # The three pairwise sums of the quartet and their median.
            sa = duv + d[x][y]
            sb = dux + d[v][y]
            sc = dvx + d[u][y]
            med = sa > sb ? (sc > sa ? sa : (sb > sc ? sb : sc)) : (sc > sb ? sb : (sa > sc ? sa : sc))
            # Violation: the median is closer to the smallest sum than to
            # the largest one.
            if (sa + sb + sc > 3 * med) { q[u]++; q[v]++; q[x]++; q[y]++ }
          }
        }
      }
    # Prefix each output line with its score as a fixed-width sort key.
    # sprintf is used explicitly because integral values (0 or 1) are
    # converted to strings without CONVFMT, which would shorten the key.
    for (i = 1; i <= n; i++) {
      score = sprintf("%.5f", q[i] / nq)   # always 7 characters for values in [0,1]
      lbl[i] = score " " substr(lbl[i] pad, 1, m) " " score
    }
    asort(lbl)
    print substr("#taxon" pad, 1, m) " tti"
    # Largest score first; position 9 skips the 7-character key and its blank.
    for (i = n; i > 0; i--) print substr(lbl[i], 9)
  }' "$infile"