1. 程式人生 > 實用技巧 >plink格式資料依據染色體拆分資料、依據染色體合併資料

plink格式資料依據染色體拆分資料、依據染色體合併資料

1、準備測試資料

[root@linuxprobe test]# ls
test.map  test.ped
[root@linuxprobe test]# wc -l *  ## 46827個位點,60個樣本
   46827 test.map
      60 test.ped
   46887 total
[root@linuxprobe test]# cut -f 1 test.map | uniq -c  ## 一共 26條染色體
   5244 1
   4970 2
   4465 3
   2436 4
   2109 5
   2318 6
   2005 7
   1851 8
   1901
9 1649 10 1077 11 1528 12 1527 13 1045 14 1506 15 1395 16 1291 17 1272 18 1124 19 998 20 768 21 984 22 1005 23 662 24 892 25 805 26

2、依據染色體拆分資料

for i in `seq 26`;do plink --file test --chr $i --sheep --recode tab --out chr$i;done  ## 利用for迴圈,結合plink軟體提取每條染色體資料,(plink提前加入環境變數或者給絕對路徑)
……
……
[root@linuxprobe test]# ls
chr10.log    chr12.ped    chr15.nosex  chr18.map    chr20.log    chr22.ped    chr25.nosex  chr3.map    chr6.log    chr8.ped
chr10.map    chr13.log    chr15.ped    chr18.nosex  chr20.map    chr23.log    chr25.ped    chr3.nosex  chr6.map    chr9.log
chr10.nosex  chr13.map    chr16.log    chr18.ped    chr20.nosex  chr23.map    chr26.log    chr3.ped    chr6.nosex  chr9.map
chr10.ped    chr13.nosex  chr16.map    chr19.log    chr20.ped    chr23.nosex  chr26.map    chr4.log    chr6.ped    chr9.nosex
chr11.log    chr13.ped    chr16.nosex  chr19.map    chr21.log    chr23.ped    chr26.nosex  chr4.map    chr7.log    chr9.ped
chr11.map    chr14.log    chr16.ped    chr19.nosex  chr21.map    chr24.log    chr26.ped    chr4.nosex  chr7.map    test.map
chr11.nosex  chr14.map    chr17.log    chr19.ped    chr21.nosex  chr24.map    chr2.log     chr4.ped    chr7.nosex  test.ped
chr11.ped    chr14.nosex  chr17.map    chr1.log     chr21.ped    chr24.nosex  chr2.map     chr5.log    chr7.ped
chr12.log    chr14.ped    chr17.nosex  chr1.map     chr22.log    chr24.ped    chr2.nosex   chr5.map    chr8.log
chr12.map    chr15.log    chr17.ped    chr1.nosex   chr22.map    chr25.log    chr2.ped     chr5.nosex  chr8.map
chr12.nosex  chr15.map    chr18.log    chr1.ped     chr22.nosex  chr25.map    chr3.log     chr5.ped    chr8.nosex

[root@linuxprobe test]# wc -l *.map
1649 chr10.map
1077 chr11.map
1528 chr12.map
1527 chr13.map
1045 chr14.map
1506 chr15.map
1395 chr16.map
1291 chr17.map
1272 chr18.map
1124 chr19.map
5244 chr1.map
998 chr20.map
768 chr21.map
984 chr22.map
1005 chr23.map
662 chr24.map
892 chr25.map
805 chr26.map
4970 chr2.map
4465 chr3.map
2436 chr4.map


2109 chr5.map
2318 chr6.map
2005 chr7.map
1851 chr8.map
1901 chr9.map
46827 test.map
93654 total

3、依據染色體合併資料

[root@linuxprobe test]# cp chr1.map merge.map  ##複製染色體1為起始的合併結果
[root@linuxprobe test]# cp chr1.ped merge.ped  ##同上
for i in `seq 2 26`;do plink --file merge --merge chr$i.ped chr$i.map --sheep --recode tab --out merge;done  ## 利用for迴圈,結合plink進行合併 (這種合併僅適合樣本完全一致時合併)

[root@linuxprobe test]# wc -l merge.ped merge.map ##統計合併結果
60 merge.ped
46827 merge.map
46887 total
[root@linuxprobe test]# md5sum test.ped test.map merge.ped merge.map ## 檢查合併後是否一致
04a12361169ff0fdc77349f280f11b6b test.ped
8d4537314a4c7beaeb0da726a8c5fcba test.map
89b9ee552b2ae3b5e9fcb5282cbbb5c9 merge.ped
8d4537314a4c7beaeb0da726a8c5fcba merge.map

## ped檔案不一致是由於在部分位點基因頻率為0.5時,主次等位基因順序顛倒引起的,不影響結果

4、驗證ped檔案md5sum不一致的情況

[root@linuxprobe test]# mkdir test  ## 單獨建立資料夾,將需要驗證結果複製至該資料夾
[root@linuxprobe test]# cp test.map test.ped merge.map merge.ped test
[root@linuxprobe test]# cd test/
[root@linuxprobe test]# ls
merge.map  merge.ped  test.map  test.ped
 plink --file merge --freq --sheep --out merge;rm *.nosex   ## 利用plink統計基因頻率
 plink --file test --freq --sheep --out test ;rm *.nosex   ## 利用plink統計基因頻率
 [root@linuxprobe test]# ls
merge.frq  merge.log  merge.map  merge.ped  test.frq  test.log  test.map  test.ped
[root@linuxprobe test]# head merge.frq  ## 檢視生成檔案格式
 CHR                           SNP   A1   A2          MAF  NCHROBS
   1                      s64199.1    A    G       0.1667      120
   1              OAR19_64675012.1    G    C         0.25      120
   1              OAR19_64715327.1    A    G       0.1583      120
   1              OAR19_64803054.1    A    G         0.25      120
   1                DU281551_498.1    G    A          0.5      120
   1                      s18939.1    0    A            0      120
   1                  OAR1_88143.1    A    G       0.2083      120
   1                      s09912.1    G    C       0.3417      120
   1                      s36301.1    A    G        0.325      120
[root@linuxprobe test]# sed 's/^[\t ]*//' merge.frq | sed 's/[\t ]\+/\t/g' | head  ## 去除行首空格,將多個空格或者製表符轉換為一個製表符,檢視前十行
CHR     SNP     A1      A2      MAF     NCHROBS
1       s64199.1        A       G       0.1667  120
1       OAR19_64675012.1        G       C       0.25    120
1       OAR19_64715327.1        A       G       0.1583  120
1       OAR19_64803054.1        A       G       0.25    120
1       DU281551_498.1  G       A       0.5     120
1       s18939.1        0       A       0       120
1       OAR1_88143.1    A       G       0.2083  120
1       s09912.1        G       C       0.3417  120
1       s36301.1        A       G       0.325   120
[root@linuxprobe test]# sed 's/^[\t ]*//' merge.frq | sed 's/[\t ]\+/\t/g' | cut -f 3-4 --complement | md5sum   ## 剔除3、4列後檢視md5sum
643867ea0404ada96aa61702bc61f678  -
[root@linuxprobe test]# sed 's/^[\t ]*//' test.frq | sed 's/[\t ]\+/\t/g' | cut -f 3-4 --complement | md5sum    ##剔除3、4列後檢視買的sum
643867ea0404ada96aa61702bc61f678  -
## 以上說明每個穩點的基因頻率完全相同

5、利用diff檢查

[root@linuxprobe test]# diff merge.frq test.frq | head   ## 差異的部分基因頻率全部為0.5,主次等位基因顛倒,這種顛倒不影響結果
6c6
<    1                DU281551_498.1    G    A          0.5      120
---
>    1                DU281551_498.1    A    G          0.5      120
218c218
<    1                      s64317.1    G    A          0.5      120
---
>    1                      s64317.1    A    G          0.5      120
251c251
<    1                      s45057.1    G    A          0.5      120
[root@linuxprobe test]# diff merge.frq test.frq | tail
---
>   24              OAR24_21591643.1    A    G          0.5      120
46100c46100
<   26               OAR26_5333971.1    A    G          0.5      120
---
>   26               OAR26_5333971.1    G    A          0.5      120
46697c46697
<   26                      s52004.1    G    A          0.5      120
---
>   26                      s52004.1    A    G          0.5      120