#! /bin/bash
#------------------------------------------------------------------------------
#$Author: saulius $
#$Date: 2019-01-15 18:48:03 +0000 (Tue, 15 Jan 2019) $
#$Revision: 5402 $
#$URL: file:///home/saulius/svn-repositories/paskaitos/VU/software/assignment-evaluation/trunk/scripts/do-check-BPKM-1st-kmer--correct-count-files $
#------------------------------------------------------------------------------
#*
# Check if the k-mer files are computed correctly.
#**

set -ue
## set -x

# Directory with the files to be checked (first positional argument).
DIRECTORY="${1?usage: $0 DIRECTORY}"

ID='$Id: do-check-BPKM-1st-kmer--correct-count-files 5402 2019-01-15 18:48:03Z saulius $'

# The identifier above with the '$' keyword markers removed; it is
# printed in the header of every remark.
REMARK_ID="${ID//\$/}"

cd "${DIRECTORY}"

# K-mer lengths for which a correct result file is expected.
KMER_LENGTHS=(1 2 3 4 6 8)

# Checksum that 'md5sum' reports for empty input; files and reference
# outputs with this checksum are never accepted as a correct result.
EMPTY_FILE_MD5=d41d8cd98f00b204e9800998ecf8427e

# Print the NUL-delimited paths of all '*fasta' files (name matched
# case-insensitively) below the current directory, skipping every path
# that contains a component starting with a dot.
list_fasta_files()
{
    find . -iname '*fasta' -print0 \
        | grep -z -v '/\.'
}

# Filter stdin to stdout, ordering the lines as requested by $1:
#   lexical  -- 'sort -k1,1'
#   numeric  -- 'sort -k1,1 -n'
#   anything else -- lines are passed through unchanged.
sort_kmers()
{
    case "$1" in
        lexical) sort -k1,1 ;;
        numeric) sort -k1,1 -n ;;
        *)       cat ;;
    esac
}

# Print the MD5 checksum of the 'k-mers' output computed for the
# concatenation of all visible FASTA files.
#   $1 -- k-mer length passed to 'k-mers';
#   $2 -- line order, see sort_kmers.
# Lines starting with '#' are removed before the checksum is taken.
# NOTE(review): assumes 'k-mers LENGTH' reads sequences from stdin when
# no file argument is given -- confirm against the 'k-mers' program.
combined_kmers_md5()
{
    local length="$1" order="$2"

    list_fasta_files \
        | xargs -0 --no-run-if-empty cat \
        | k-mers "${length}" \
        | grep -v '^#' \
        | sort_kmers "${order}" \
        | md5sum | awk '{print $1}'
}

# Print one MD5 checksum per visible FASTA file: the checksum of the
# 'k-mers' output computed for that file alone.
#   $1 -- k-mer length passed to 'k-mers';
#   $2 -- line order, see sort_kmers.
# File names are read NUL-delimited so that names with blanks,
# backslashes or newlines are passed to 'k-mers' intact.
per_file_kmers_md5()
{
    local length="$1" order="$2" fasta_file

    list_fasta_files \
        | while IFS= read -r -d '' fasta_file
          do
              k-mers "${length}" "${fasta_file}" \
                  | grep -v '^#' \
                  | sort_kmers "${order}" \
                  | md5sum | awk '{print $1}'
          done
}

for KMER_LENGTH in "${KMER_LENGTHS[@]}"
do

    # Build an alternation of all acceptable checksums: the combined
    # and the per-file outputs, each lexically sorted, numerically
    # sorted and unsorted. Only well-formed, non-empty-input checksums
    # are kept, so the alternation never contains an empty branch
    # (an empty branch would match every line).
    KMERS_MD5_REGEXP=$(
        for ORDER in lexical numeric unsorted
        do
            combined_kmers_md5 "${KMER_LENGTH}" "${ORDER}"
            per_file_kmers_md5 "${KMER_LENGTH}" "${ORDER}"
        done \
            | grep -E '^[0-9a-f]{32}$' \
            | grep -v -x -F "${EMPTY_FILE_MD5}" \
            | sort -u \
            | tr "\n" "|" \
            | sed 's/|$//'
    )

    # Look for any non-empty file whose checksum is one of the accepted
    # ones; matching 'md5sum' lines are printed to stdout. When no
    # reference checksum could be computed at all, nothing can be
    # confirmed as correct, so the remark is emitted as well.
    if [ -z "${KMERS_MD5_REGEXP}" ] \
            || ! find . -type f -print0 \
                | xargs -0 --no-run-if-empty md5sum \
                | grep -v "${EMPTY_FILE_MD5}" \
                | grep -E "${KMERS_MD5_REGEXP}"
    then

        # Remark text (Lithuanian): "No correct result found with
        # N-mer frequencies (-10 points)".
        cat <<EOF
-- [AUTO;INTERFACE;${REMARK_ID}]
   Nerastas teisingas rezultatas su ${KMER_LENGTH}-erų dažniais (-10 balų)

EOF

    fi

done
