linux系統shell實現統計 plink檔案基因頻率
阿新 • • 發佈:2021-10-30
1、
[root@centos79 test]# cat test.sh #!/bin/bash #step1 check consistence of columns temp1=`head -n 1 $1 | awk '{print NF}'` for i in $(seq `sed -n "$=" $1`) do temp2=$(sed -n "$i"p $1 | awk '{print NF}') if [ $temp2 -ne $temp1 ] then echo "$i row option exception!" echo -e "row1: $temp1 columns \nrow$i: $temp2 columns" exit fi done echo "step1 done!" #step2 check nucleotide A T G C or 0 awk '{for(i = 7; i <= NF; i++) {print $i}}' $1 | sed 's/[\t ]/\n/' | sort -u > tempped for i in `cat tempped` do if [ $i != "G" ] && [ $i != "C" ] && [ $i != "A" ] && [ $i != "T" ] && [ $i != "0" ] then echo "abnormal nucleotide: $i" exit fi done rm -f tempped echo "step2 done!" #step3 check match of map and ped map=$(sed -n "$=" $2) ped=$(head -n 1 $1 | awk '{print (NF - 6)/2}') if [ $map -ne $ped ] then echo "map and ped do not match!" exit fi echo "step3 done!" #step4 statistic for i in$(seq `head -n 1 $1 | awk '{print (NF - 6)/2}'`); do awk -v a=$i '{print $(a*2 + 6),$(a*2+5) }' $1 | sed 's/ /\n/' | sort | uniq -c | sed 's/^[\t ]\+//g' | sort -n | xargs | awk '{if(NF == 2 && $2 != 0) {print "NA", "0", $2, "1"} else if(NF == 2 && $2 == 0) {print "0", "0", "0", "1"} else if(NF == 4 && $2 != 0) {print $2, $1/($1 + $3), $4, $3/($1 + $3)} else if(NF == 4 && $2 == 0) {print "NA", "0", $4, "1"} else if(NF == 6) {print $4, $3/($3 + $5), $6, $5/($3 + $5)}}' | sed 's/ /\t/g' >> tempresult.txt; done echo "step 4 done!" #step5 add coordinate cut -f 1-2,4 $2 | paste - tempresult.txt | sed "1i chr\tsnp\tpos\tmaf\tfreq\tmaj\tfreq" > freqresult.txt rm tempresult.txt echo "finished!"
用法:
[root@centos79 test]# ls ## 測試資料 outcome.map outcome.ped test.sh [root@centos79 test]# cat outcome.map 1 snp1 0 55910 1 snp2 0 85204 1 snp3 0 122948 1 snp4 0 203750 1 snp5 0 312707 1 snp6 0 356863 1 snp7 0 400518 1 snp8 0 487423 [root@centos79 test]# cat outcome.ped DOR 1 0 0 0 -9 G G 0 0 0 0 0 0 A A T T G G G C DOR 2 0 0 0 -9 G G G G 0 0 0 0 A A T T A G C C DOR 3 0 0 0 -9 G G C C 0 0 G G G G T T A G G C DOR 4 0 0 0 -9 G G C C 0 0 G G G G A A G G G G DOR 5 0 0 0 -9 G G C C 0 0 G G G G A A A G G C DOR 6 0 0 0 -9 G G C C 0 0 G G G G A A A A C C [root@centos79 test]# bash test.sh outcome.ped outcome.map step1 done! step2 done! step3 done! step 4 done! finished! [root@centos79 test]# ls ##生成的結果 freqresult.txt outcome.map outcome.ped test.sh [root@centos79 test]# cat freqresult.txt chr snp pos maf freq maj freq 1 snp1 55910 NA 0 G 1 1 snp2 85204 G 0.2 C 0.8 1 snp3 122948 0 0 0 1 1 snp4 203750 NA 0 G 1 1 snp5 312707 A 0.333333 G 0.666667 1 snp6 356863 A 0.5 T 0.5 1 snp7 400518 A 0.416667 G 0.583333 1 snp8 487423 G 0.416667 C 0.583333
2、plink軟體驗證, 結果一致
[root@centos79 test]# plink --file outcome --freq --out verify > /dev/null; rm *.log *.nosex [root@centos79 test]# ls freqresult.txt outcome.map outcome.ped test.sh verify.frq [root@centos79 test]# cat freqresult.txt chr snp pos maf freq maj freq 1 snp1 55910 NA 0 G 1 1 snp2 85204 G 0.2 C 0.8 1 snp3 122948 0 0 0 1 1 snp4 203750 NA 0 G 1 1 snp5 312707 A 0.333333 G 0.666667 1 snp6 356863 A 0.5 T 0.5 1 snp7 400518 A 0.416667 G 0.583333 1 snp8 487423 G 0.416667 C 0.583333 [root@centos79 test]# cat verify.frq CHR SNP A1 A2 MAF NCHROBS 1 snp1 0 G 0 12 1 snp2 G C 0.2 10 1 snp3 0 0 NA 0 1 snp4 0 G 0 8 1 snp5 A G 0.3333 12 1 snp6 A T 0.5 12 1 snp7 A G 0.4167 12 1 snp8 G C 0.4167 12