From 3e2aa6e04eec9187eb124b7d8a7f4167cde9fc70 Mon Sep 17 00:00:00 2001 From: JinzhuangDou Date: Sun, 25 Feb 2024 21:21:10 -0600 Subject: [PATCH] Add files via upload --- src/AlleleDropOut_germline.R | 187 ++++++++ src/LDrefinement.R | 36 +- src/Monopogen.py | 155 +------ .../Single_Cell_Ftrs_Pos.cpython-37.pyc | Bin 9849 -> 9842 bytes src/__pycache__/alleles_prior.cpython-37.pyc | Bin 3037 -> 3030 bytes src/__pycache__/bamProcess.cpython-37.pyc | Bin 0 -> 3690 bytes src/__pycache__/base_q_ascii.cpython-37.pyc | Bin 4263 -> 4256 bytes .../calc_variant_prob.cpython-37.pyc | Bin 2746 -> 2739 bytes .../genotype_prob_mat.cpython-37.pyc | Bin 4052 -> 4045 bytes src/__pycache__/germline.cpython-37.pyc | Bin 0 -> 6884 bytes src/__pycache__/hzvcf.cpython-37.pyc | Bin 11217 -> 11210 bytes src/__pycache__/mp_genotype.cpython-37.pyc | Bin 4147 -> 4140 bytes .../nu_genotype_single_cell.cpython-37.pyc | Bin 3465 -> 3458 bytes src/__pycache__/nu_prob_mat.cpython-37.pyc | Bin 4904 -> 4897 bytes src/__pycache__/somatic.cpython-37.pyc | Bin 0 -> 12829 bytes src/__pycache__/utils.cpython-37.pyc | Bin 14683 -> 14676 bytes src/bamProcess.py | 163 +++++++ src/haplotype_somatic.R | 101 +++++ src/index_motif.py | 0 src/somatic.py | 426 +++++++++++++----- src/svm_phasing_sub.R | 268 +++++++++++ 21 files changed, 1091 insertions(+), 245 deletions(-) create mode 100644 src/AlleleDropOut_germline.R create mode 100644 src/__pycache__/bamProcess.cpython-37.pyc create mode 100644 src/__pycache__/germline.cpython-37.pyc create mode 100644 src/__pycache__/somatic.cpython-37.pyc create mode 100644 src/bamProcess.py create mode 100644 src/haplotype_somatic.R create mode 100644 src/index_motif.py create mode 100644 src/svm_phasing_sub.R diff --git a/src/AlleleDropOut_germline.R b/src/AlleleDropOut_germline.R new file mode 100644 index 0000000..89bf0da --- /dev/null +++ b/src/AlleleDropOut_germline.R @@ -0,0 +1,187 @@ +library(reshape2) +calcP <- function(geno_vec, dis_vec, dis_limit=20000){ + + dis_bin <- c(100, 1000, 5000, 10000, 20000, 500000, 1000000000000000000) + prob_bin <- c(1, 0.93, 0.78, 0.71, 0.69, 0.64, 0) + + prob_bin[dis_bin>dis_limit] <- 0 + + pos_match <- which(geno_vec%in%c("000","111")) + dis_match <- dis_vec[pos_match] + + match_p <- 0 + for(pos in dis_match){ + if(pos<=dis_bin[1]){p <-1} + else{ + p <- prob_bin[max(which(pos>=dis_bin))+1] + } + match_p <- match_p + p/sqrt(pos) + } + + pos_mismatch <- which(geno_vec%in%c("010","101")) + dis_mismatch <- dis_vec[pos_mismatch] + + + mismatch_p <- 0 + for(pos in dis_mismatch){ + if(pos<=dis_bin[1]){p <-1} + else{ + p <- prob_bin[max(which(pos>=dis_bin))+1] + } + mismatch_p <-mismatch_p + p/sqrt(pos) + } + + if(is.na(match_p) | is.na(mismatch_p)){p<-(-1)} + else{ + if((match_p + mismatch_p) ==0){ + p <- (-1) + }else{ + p <- match_p/(match_p+mismatch_p) + } + } + + return(p) +} + + + + + +args = commandArgs(trailingOnly=TRUE) + +chr <- args[1] +out <- args[2] +len <- as.numeric(args[4]) +bin <- as.numeric(args[5]) + +load(paste0(args[3],chr,".input.image.RDS")) + + + +germline <- mat_qc[!mat_qc$V4=="0|0" & !mat_qc$V4==".|.",] + +print(dim(germline)) + +y <- germline +dis <- colsplit(y$V1,":", c("id1","id2"))[,2] +check <- germline[,c(1,2,3,4)] +check$prob <- (-1000) +check$tol <- 0 +check$dis <- (-1000) + +check$prob_phy <- (-1000) +check$tol_phy <- 0 + +pos_lst <-list() +sta <- 0 +for(i in seq(5,ncol(y),1)){ + vec <- y[,i] + pos_lst[[i]] <- which(!vec=="0|0" & !y$V4==".|.") + sta <- sta + length(which(!vec=="0|0" & !y$V4==".|.")) +} + + +dis_rd <-c() +trio_hap_rd <-c() +cnt <- 0 +dis_limit <- 20000 + +lst <- germline$V2 +dis_rd2 <-c() +hap_rd2 <-c() + +for(snv in lst[seq(1,length(lst),bin)]){ + cnt=cnt+1 + #print(paste0("scanning ",cnt," now...")) + snv_index <- which(y$V2==snv) + snv_index_noMisCol <- which(!y[snv_index,]=="0|0" & !y[snv_index,]=="0/0",) + + # remove the first column since they are not genotypes + snv_index_noMisCol <- snv_index_noMisCol[snv_index_noMisCol>4] + + geno_vec <-c() + dis_vec <-c() + dis_phy1 <- c() + dis_phy2 <- c() + # identify haplotypes for each element + index <-0 + for(pos in snv_index_noMisCol){ + # element index (snv_index, pos) + pos_noMisCol <- pos_lst[[pos]] + lower_vec <- which(pos_noMisColsnv_index) + + if(length(lower_vec)>=1 & length(upper_vec)>=1){ + lower <- pos_noMisCol[max(lower_vec)] + upper <- pos_noMisCol[min(upper_vec)] + tp <- y[snv_index, pos] + tp <- gsub("/", "|", tp) + geno <- c(y[lower,pos], tp, y[upper, pos]) + + part <-colsplit(geno,"\\|",c("i1","i2"))[,2] + part[part>1] <- 1 + flag <- paste0(part[1],part[2],part[3]) + hap_rd2 <-c(hap_rd2, abs(part[2]-part[1]), abs(part[3]-part[2])) + dis_rd2 <-c(dis_rd2, dis[snv_index]-dis[lower], dis[upper]-dis[snv_index]) + + dis_vec <- c(dis_vec, dis[upper]-dis[lower]) + dis_phy1 <- c(dis_phy1, dis[snv_index]-dis[lower]) + dis_phy2 <- c(dis_phy2, dis[upper]-dis[snv_index]) + geno_vec <-c(geno_vec,flag) + } + } + + sum <- 0 + sum_equal <- 0 + + if(!is.null(dis_phy1) & sum(dis_phy10){ + p <- sum_equal/sum + check$prob_phy[cnt] <- p + check$tol_phy[cnt] <- sum + print(paste0("Scan ", cnt,":", snv, " ", sum, " ", p, " ",len)) + } + + if(length(geno_vec)>0 ){ + dis_rd <-c(dis_rd, dis_vec) + trio_hap_rd <-c(trio_hap_rd, geno_vec) + #print(table(trio_hap_rd)) + sel <-list() + sel <- table(geno_vec[dis_vec<1000000]) + + if(length(sel)>0){ + check$prob[cnt] <- 0 + check$tol[cnt] <- sum(dis_vec=2] + filter <- names(region_cnt)[region_cnt>=4] neg <- x[(x$region%in% filter & x$genotype==".|."),] pos <- x[!x$genotype==".|.",] svm$neg <- neg @@ -49,7 +48,6 @@ SVM_train <- function(label=NULL, dir=NULL, region=NULL){ label$pos[label$pos=="None"] <- NA # using median values to replace the missing values train_x_pos <- impute(as.matrix(data.matrix(label$pos[,features])), what="median") - train_x_pos[is.na(train_x_pos)] <- 0 # using the minior value of QS vec <- train_x_pos[,colnames(train_x_pos)=="QS"] @@ -59,7 +57,6 @@ SVM_train <- function(label=NULL, dir=NULL, region=NULL){ label$neg <- as.data.frame(label$neg) label$neg[label$neg=="None"] <- NA train_x_neg <- impute(as.matrix(data.matrix(label$neg[,features])), what="median") - train_x_neg[is.na(train_x_neg)] <- 0 vec <- train_x_neg[,colnames(train_x_neg)=="QS"] vec[vec>0.5] <- 1- vec[vec>0.5] train_x_neg[,1] <- vec @@ -67,7 +64,6 @@ SVM_train <- function(label=NULL, dir=NULL, region=NULL){ label$test <- as.data.frame(label$test) label$test[label$test=="None"] <- NA test_x <- impute(as.matrix(data.matrix(label$test[,features])), what="median") - test_x[is.na(test_x)] <- 0 vec <- test_x[,colnames(test_x)=="QS"] vec[vec>0.5] <- 1- vec[vec>0.5] test_x[,1] <- vec @@ -83,10 +79,12 @@ SVM_train <- function(label=NULL, dir=NULL, region=NULL){ print(p) } dev.off() - + + model <- svm(as.matrix(rbind(train_x_pos, train_x_neg)), as.factor(c(train_y_pos,train_y_neg)), probability=TRUE) + print("Done") pred <- predict(model, as.matrix(test_x), probability=TRUE) prob <- attr(pred, "probabilities") prob <-as.data.frame(prob) @@ -410,10 +408,11 @@ somaticLD <- function(mat=NULL, svm=NULL, dir=NULL, region=NULL, min_size=50){ dt <- fread(mat_gz) dt <- data.frame(dt) -dt$V10[is.na(dt$V10)] <- ".|." +dt$V10[dt$V10=="nan"] <- ".|." rownames(dt) <- paste0(dt$V1,":",dt$V2,":",dt$V3,":",dt$V4) -cellName <- read.table(file=paste0(outdir,region,".cell.txt"),header=F) +cellName <- read.csv(file=paste0(outdir,region,".cell_snv.cellID.filter.csv"),header=T) +cellName <- cellName[,2] for(j in seq(11,18,1)){ dt[,j] <- as.numeric(dt[,j]) @@ -422,7 +421,6 @@ meta <- dt[,1:18] colnames(meta) <-c("chr","pos","ref","alt","Dep","dep1","dep2","dep3","dep4", "genotype","QS","VDB","RPB","MQB","BQB","MQSB","SGB","MQ0F") - mutation_block <- SNV_block(summary=meta) svm_in <- SVM_prepare(mutation_block) svm_out <- SVM_train(label =svm_in,dir=outdir, region=region) @@ -431,7 +429,25 @@ final <- somaticLD(mat=dt, svm=svm_out, dir=outdir, region=region) mut_mat <- dt[rownames(final$out),] -colnames(mut_mat) <-c(colnames(meta),cellName[,1]) +colnames(mut_mat) <-c(colnames(meta),cellName) + + +#### re-assign the sequencing depth for each allele + +for(i in seq(1, nrow(mut_mat),1)){ + N_wild <- 0 + N_mut <- 0 + vec <- mut_mat[i,seq(19, ncol(mut_mat),1)] + sta <- unname(unlist(vec)) + sta <- table(c(sta,"0/1","1/0","1/1")) + sta <- sta - 1 + N_alt <- sta["0/1"] + sta["1/1"] + N_ref <- sta["1/0"] + sta["1/1"] + LDrefine_somatic[i,6] <- N_ref + LDrefine_somatic[i,7] <- N_alt + LDrefine_somatic[i,8] <- N_ref + N_alt + LDrefine_somatic[i,12] <- N_alt/(N_ref+N_alt) +} write.csv(final$LDrefine_somatic, paste0(outdir,region,".putativeSNVs.csv"),quote=FALSE,row.names = FALSE) write.csv(final$LDrefine_germline2, paste0(outdir,region,".germlineTwoLoci_model.csv"),quote=FALSE, row.names = FALSE) diff --git a/src/Monopogen.py b/src/Monopogen.py index 1836746..d656463 100644 --- a/src/Monopogen.py +++ b/src/Monopogen.py @@ -17,6 +17,7 @@ import numpy as np import gzip from pysam import VariantFile +from bamProcess import * from germline import * from somatic import * import multiprocessing as mp @@ -46,6 +47,18 @@ +def error_check(all, output, step): + job_fail = 0 + for id in all: + if id not in output: + logger.error("In "+ step + " step " + id + " failed!") + job_fail = job_fail + 1 + + if job_fail > 0: + logger.error("Failed! See instructions above.") + exit(1) + + def germline(args): logger.info("Performing germline variant calling...") @@ -74,13 +87,13 @@ def germline(args): jobid = record[0] + ":" + record[1] + "-" + record[2] bam_filter = args.out + "/Bam/" + record[0] + ".filter.bam.lst" imputation_vcf = args.imputation_panel + "CCDG_14151_B01_GRM_WGS_2020-08-05_" + record[0] + ".filtered.shapeit2-duohmm-phased.vcf.gz" - cmd1 = samtools + " mpileup -b " + bam_filter + " -f " + args.reference + " -r " + jobid + " -q 20 -Q 20 -t DP -d 10000000 -v " + cmd1 = samtools + " mpileup -b" + bam_filter + " -f " + args.reference + " -r " + jobid + " -q 20 -Q 20 -t DP -d 10000000 -v " cmd1 = cmd1 + " | " + bcftools + " view " + " | " + bcftools + " norm -m-both -f " + args.reference cmd1 = cmd1 + " | grep -v \"\" | grep -v INDEL |" + bgzip + " -c > " + args.out + "/germline/" + jobid + ".gl.vcf.gz" #cmd2 = bcftools + " view " + out + "/germline/" + jobid + ".gl.vcf.gz" + " -i 'FORMAT/DP>1' | " + bcftools + " call -cv | " + bgzip + " -c > " + args.out + "/SCvarCall/" + jobid + ".gt.vcf.gz" cmd3 = java + " -Xmx20g -jar " + beagle + " gl=" + out + "/germline/" + jobid + ".gl.vcf.gz" + " ref=" + imputation_vcf + " chrom=" + record[0] + " out=" + out + "/germline/" + jobid + ".gp " + "impute=false modelscale=2 nthreads=24 gprobs=true niterations=0" - cmd4 = "zless -S " + out + "/germline/" + jobid + ".gp.vcf.gz > " + out + "/germline/" + jobid + ".germline.vcf" + cmd4 = "zless -S " + out + "/germline/" + jobid + ".gp.vcf.gz | grep -v 0/0 > " + out + "/germline/" + jobid + ".germline.vcf" cmd5 = java + " -Xmx20g -jar " + beagle + " gt=" + out + "/germline/" + jobid + ".germline.vcf" + " ref=" + imputation_vcf + " chrom=" + record[0] + " out=" + out + "/germline/" + jobid+ ".phased " + "impute=false modelscale=2 nthreads=24 gprobs=true niterations=0" f_out = open(out + "/Script/runGermline_" + jobid + ".sh","w") if args.step == "varScan" or args.step == "all": @@ -106,19 +119,6 @@ def germline(args): -def sort_chr(chr_lst): - # sort chr IDs from 1...22 - chr_lst_sort = [] - for i in range(1, 23): - i = str(i) - if i in chr_lst: - chr_lst_sort.append(i) - i_chr = "chr"+i - if i_chr in chr_lst: - chr_lst_sort.append(i_chr) - chr_lst = chr_lst_sort - return chr_lst - def somatic(args): @@ -138,120 +138,28 @@ def somatic(args): region_lst.append(region) chr_lst = list(set(chr_lst)) - if args.step=="featureInfo" or args.step=="all": - logger.info("Get feature information from sequencing data...") - + + logger.info("Get feature information from sequencing data...") joblst = [] for id in region_lst: - joblst.append(id+">"+args.out) + joblst.append(id+">"+args.out+">"+args.app_path) + with Pool(processes=args.nthreads) as pool: result = pool.map(featureInfo, joblst) + error_check(all = chr_lst, output = result, step = "LDrefinement") if args.step=="cellScan" or args.step=="all": - logger.info("Get single cell level information from sequencing data...") - + logger.info("Collect single cell level information from sequencing data...") chr_lst = sort_chr(chr_lst) - - run = 1 - if run: - joblst = [] - for id in chr_lst: - joblst.append(id+">"+args.out+">"+args.app_path) - with Pool(processes=args.nthreads) as pool: - result = pool.map(bamExtract, joblst) - - ####### merge bams from different chromosomes - bamlst = [] - print(chr_lst) - for chr in chr_lst: - bam_filter = out + "/Bam/" + chr + ".filter.targeted.bam" - bamlst.append(bam_filter) - print(bamlst) - output_bam = out + "/Bam/merge.filter.targeted.bam" - pysam.merge("-f","-o",output_bam,*bamlst) - os.system(samtools + " index " + output_bam) - - if run: - os.system("mkdir -p " + out + "/Bam/split_bam/") - cell_clst = pd.read_csv(args.barcode) - df = pd.DataFrame(cell_clst, columns= ['cell','id']) - df = df.sort_values(by=['id']) - args.keep = float(args.keep) - if args.keep < 1: - dis = np.cumsum(df['id'])/np.sum(df['id']) - N = sum(dis>(1-args.keep)) - df = df.iloc[-(N):] - cell_lst = df['cell'].unique() - joblst = [] - - for cell in cell_lst: - para = "merge" + ":" + cell + ":" + args.out + ":" + args.app_path - joblst.append(para) - - with Pool(processes=args.nthreads) as pool: - result = pool.map(bamSplit, joblst) - - if sum(result)==0: - logger.error("No reads detected for cells in " + args.barcode + ". Please check 1) the input cell barcode file is matched with bam file; 2) the cell barcode name has the same format shown in bam file. For example XX-1!") - logger.error("Failed! See instructions above.") - exit(1) - - # generate the bam file list - cell_bam = open(out + "/Bam/split_bam/cell.bam.lst","w") - for cell in cell_lst: - cell_bam.write(out + "/Bam/split_bam/" + cell + ".bam\n") - cell_bam.close() - - - region_lst = [] - if args.winSize=="10MB": - region_file = args.app_path + "/../resource/GRCh38.region.10MB.lst" - if args.winSize=="50MB": - region_file = args.app_path + "/../resource/GRCh38.region.50MB.lst" - with open(region_file) as f_in: - for line in f_in: - record = line.strip().split(",") - if(len(record)==1): - region = record[0] - if(len(record)==3): - region = record[0] + ":" + record[1] + "-" + record[2] - if record[0] in chr_lst: - region_lst.append(region) - - joblst = [] - for id in region_lst: - record = id.strip().split(":") - chr = record[0] - joblst.append(id+">"+chr+">"+args.out+">"+args.app_path+">"+args.reference) - - - with Pool(processes=args.nthreads) as pool: - result = pool.map(jointCall, joblst) - error_check(all = region_lst, output = result, step = "cellScan:joint calling") - - - ##### merge vcfs from multiple regions - - for id in chr_lst: - tp = "" - for reg in region_lst: - if re.search(id+":", reg): - tp = tp + " " + args.out+"/somatic/" + reg + ".cell.gl.vcf.gz" - cmd = args.app_path + "/bcftools concat -o " + args.out+"/somatic/" + id + ".cell.gl.vcf.gz " + tp + " -O z" - print(cmd) - output = os.system(cmd) - joblst = [] for id in chr_lst: - joblst.append(id+">"+args.out) + joblst.append(id+">"+args.out+">"+args.reference+">"+args.barcode) with Pool(processes=args.nthreads) as pool: - result = pool.map(vcf2mat, joblst) - error_check(all = chr_lst, output = result, step = "cellScan:vcf2mat") - - + result = pool.map(bam2mat, joblst) + error_check(all = chr_lst, output = result, step = "cellScan") if args.step=="LDrefinement" or args.step=="all": logger.info("Run LD refinement ...") @@ -265,18 +173,6 @@ def somatic(args): -def error_check(all, output, step): - job_fail = 0 - for id in all: - if id not in output: - logger.error("In "+ step + " step " + id + " failed!") - job_fail = job_fail + 1 - - if job_fail > 0: - logger.error("Failed! See instructions above.") - exit(1) - - def preProcess(args): logger.info("Performing data preprocess before variant calling...") @@ -392,9 +288,6 @@ def main(): help="The app library paths used in the tool") parser_somatic.add_argument('-t', '--nthreads', required=False, type=int, default=22, help="Number of jobs used for SNV calling") - parser_somatic.add_argument('-w', '--winSize', required=False, default="50MB", - choices=['10MB','50MB'], - help="Split the chromosome into small segments for cell level sequencing information collection. Setting 10MB will generate more jobs but be faster") parser_somatic.add_argument('-s', '--step', required=True, choices=['featureInfo', 'cellScan' , 'LDrefinement', 'monovar', 'all'], help="Run germline variant calling step by step") diff --git a/src/__pycache__/Single_Cell_Ftrs_Pos.cpython-37.pyc b/src/__pycache__/Single_Cell_Ftrs_Pos.cpython-37.pyc index 8fec6aafe73ab21c49fc99932a3dd1d183e257d1..a5a64d40ce0a93279966ab7e8e47f3f2e6a30d52 100644 GIT binary patch delta 74 zcmezA^T~(DiIvAH)Cv diff --git a/src/__pycache__/alleles_prior.cpython-37.pyc b/src/__pycache__/alleles_prior.cpython-37.pyc index c8603e8066a80b3281baa5c79980c4d2477be707..c04958913da00b0f65cca7dc8595b85ba5d7d751 100644 GIT binary patch delta 74 zcmcaBeodUmiId delta 81 zcmca6epj5wiIq?2Kw1lXD|43!EdOdKbfRAr3wfJ`+^b+vS_)u4zmQG^R5n(F0wf5f}>1z*J}jmcmj{QfLRZLMLz( zx`C^(9F!G$fv2z%RDovV4r)P7*SOiQPa8@$m0VKt3_m;5XF9WH#!L^E`7$^9^{0;b z3*qz})A{nuSZJSU82$51U)4x2SXs=t!b&qExWKLHDt`~{7x~5M``qe(XQo5jntm|T zf=m3;)JNN8W;2JmFR>!}E>5rTOH^WI=KWp^K4cYE1%8*+SRMEgYp^Epd(3#GwU)-0 zy~lgpA4KuckB2!IyHS_>>8_u3chb>aKIGmjGW2#!mz7^fB94anojBp++9)3JBp&jt zox~k!c6WQ@_HUj?SvQXdeDan5JmH5t8Abzs{aQBhpYKA8YA6nPmPI{2k@|B?Tgvj| zgm-ay>Q{lJE4_s;+d*rg^|c8ub7B;@*cEMJ=6a#Oz!7s}YNE}Y8566p`qs2G(02{>;-Rfvcwrgn0lk814r&!(1y}`Efi++aSO?aD4PXP<1U7+7z$M@r z;2Gdq;Mu~NE*BnM-2U*1_5_anKzpctsbOE|W}2>{UtxZJy29MTT!F*D0#qwgjd?hs z{*Bhkyp>hwJxs%;*S2w8dt&V5gVA~?8c0KQr19thzHMnd>)=l8JsvSG#?34mkRBY$l9gG2wd?vHJ##y%PdTNk8*naDF_X(ZrDfMY1j+RY^H4F`PgHiB+fI(Zth zxQj8nJn!zwy5PH9@L`vS${~$`@{8*N{aW4{bX~>WA@N{&{Z(nvFDXU7?oTNPb_ce?gV&`gvXa5c+RI*+PXr70QoM=pm4g?&ON0Ul<6s zYZC*(Y`mc0HVW;jE!GQtin3xbvryq)n5w6#dQ24}%_ng738(??Y;C_H?s&BVA#8}2 zDN7<6_Bf203!HB>;zQQ5qyr0vNtOrIMIp>m_=+9}rI@U%%sLlj4e3%hb^H>wY#Bl` zEWN2t>n+Y~>EaW#zd3UQl@dO6=HDv%nrbs`3IYR(fVh~kswpZnfka=RD$tisBsD34 zl57xWjm7}0=aX$|y)!Zqntog#2)QTEgdTM0%y-ACauP|(EN|2i*EJ*-{rS7wIz`rAg}<|@;wqBM7bHamkoSXlq9y-ezPZCVFu ze@owdP2Bz2?brHsLDw&S1o6rsVM2_{?Gx%6f6(5H2JHbCJ-&`t2#?%4IHJdM&WQem zgYnXMco+Qlg*5JMjT=fK8-X||P}_>*27*$MmaZr)Fm7!I?sl5rNe_l>>@3)^)u7i* z#3KM{3-Shmghf+YVu@BVK`238q@9B-53!37JR*GRF$4#D@CZRI&XLk(5?4rk2!YwH zQ6wVik|ak~?k=q^PZDW@Go%Y&4@VK`$~`&0^w6>383`_;cUT(v2yehJS7fAG^i1kG z+6L?!Q2-M>QntVzvg;5U_+SzJCSld^5EWKUh4LF~9d7gz?C5kq(s@o+ngo&;T%By(^5pkYk=4iR7 zA&#n(dQqpdP3rr9f1(vNN;VKTRc0WP{!<_qQ%~~>{Q1VIKZ{Rc9r06$SHz=lJ9Qas zzpI|+vEAc{x`Ylr^W^b5TA{$b3Y`Qs&jn2LeFHdyPV`JS5R3i^A3W9mcy%Ep&B z-mc%<-nxC?FQC5G^>6regme()ako8gtg9XNlJ&#x?s{)5bAKm`E1X8Ksk?y&f_aFu>kH9w*Z3C22?e9IA>&kZ&NdmR@$y8Opu3%g>*xelwls5c26xCIw|p&rLEhW zzq%`Hk8&(K_#8V;(C_Rm3RMie14oK-@UbkP7CodRi_t2ZP|vI44Q0l+E{2jIm9|KC tW010ggx{bj$*w_IigHda=ORiIre?^|hiIXOH8o72|Tp6NIq&!jC&vOO&c9@`za*)f}NISU%=(J9A%7MBJFhP)G&N&4^4mssiIRya%_!8te@G0N_idyV;BN0*W zfBW+9|F6Zt!JLM_>EVCl($Td4q{hi#7M0KOg}b_@F^%aBt(-u;tgE(BHWW0=rh-=4 zQZQ9cDVQ#&70i?~3fg5`!E8CJV6Hr%V7@%4V4++9G#Z)aP>`^4oM-w|t#s)%QDeQN#{+-t3eWZBir3(1>OV$dJr06KXRzh9c*FB~IM?v1v02-w zcjmr6biiIyN?+1m-stL)&WvqCT-g~@Gt8bD>B)Gc1F;G*J7dp5 zNIc0}?_7B73Y;&w2p!LN_{IjW$vd&N(e`Vdfti`ZuJd!}>%;ELk`<>LLA}mJY&h2;=ldAeyc$8L{PxdKjB1f2WBdT=C8QIT4E5=B>G~CR|zDoU7KVb6YHEFQDh1 zU*n+^2XkwIAGCrx_ba;#vp=1~ia8w8BrMLh4&w2a@O)WmonEt2_jb8|lMtZp^_YA^Y&pg(8*0I*dm>J%XDQ3RXztN?>mF{IQ z+dkIA74%xYY|^69BmLkXWscgfQ2*|4yT*~e-;0bT?MQ$5o30s|v}z zu3C4bha;F%czX_EVfyujSA@IBXv-+!e&(Y`{>zinx9bUSxsh|kaM1Z9Oi$K-|H__4ZW zQX{&3qrcF6rR*~IqV8j1g}jLop5=Vivx$` z!`XC0M+N}9+~NCfO*RgkTaI+AGYo2|=}FFYY&T?h$BFg% zPU}1-YJo3Z&kx^obw!e(EuEGKT3k4ExN~EEe_?*UTGBYtygENYwGEzkcO^{;2cfJ{{ild{8&=9jIc3#1r0{@1C#`c-6)6fh!%q z-K^p+RWA29XgBW=yDxx-AcD*2)We1DbmzshO)kLFp%Vr$BAdjsdm^Bc_)Ko{6J9&x z`dLR2RSm+Rgo^6TfVP2IA!CAD&Xy+1BQ+9(3K6@~Cw6QEVVr6KPkw9>;mJ-4uN4>W zg&`NTa#@G~CW`nePJ1D-Yk44fYo!4m5@Zx&OKp(~TMbVN@~+|(`S_16fu%F#5~r&Wf;kDD#N%4!Is(J7p;Tm8FP*!>@{yN%sZKXF;bjhq<6z;+%&#=W}(=?UQqQ1vUV4osvUQ3Z^*^)BO8>-BNW6=JoXCo!_6|TRBpN z_0BDbf&(uGasZXI0kI%uwc&;mGW7$>zH@iQYryq_7NeE&poJAQ7xK06#e-Y45j3OE z8}Bm}ogdMWg4ao^f9MEt0i=FCfudv2K_ku8HbrM>7SnMqG7CMf+t$a&|I4hEisB+JDn6uQ3PqefC7rlTol03i9WYPCMX%XvOQoLRj)Ev8(}esu zC0rjq>nF5>2^AwgqO~7T;ZSh}#i?Szz2gjU4bpQ#ZmgK0VwMVWo6qTmsZ+fGTPZ3q z1`-;!a^fDBF*~GdYQ_|l!4$-O90H%yZ3zFokymO#{0!r#!f&CG!KZ}(m#8qkt3w

4P}Po zRJGoLX11&Sj?Q?MyY&X2oxi=ptDk;4yW#$1cFPr~{h$89FWZfU&t`9b=2CxbLm8^A zI=PltgB4J{)|R{Lb_UP6V29l|`kWZSV>%z4IAQPctUv%o>`6>a=xV#I7APq<#U<(^ zOqMLAzRSZf#^lX~6&X~N9fC=rE~4?fWa=9##$lEi#NOkfk7*$VY?^KvIS@w>#%pAu z>2ClZSsg`&73g4)0a}y+-CdK2fVz2Xkbc@S$l>m$_MHf!$fO2BjdjzJ5v5sbQUi$q z?`k_Yq2ZvZL83I~Wxh4P`%O2q|JTr?ARX3x)3ve2-hwmV&EkY~l-U9SyE*h_qFf~d zXam|&7G(-8f?2>EAk7*8901G%<^cx*2LTI!1;8P|A;2PF5pWoAILh=!q8z!_Z@b~R z*XS{vc#JD3kwHKJVdFhuyTldmU0_*iu^f8w(sz0XlHMc)q6`HB+QD`4hd^s5BbzN! zZm}V%vGMd*_|{`%uZqAry$QF0i%zERh<7Zfx$C^qolaV5H78MZrE%Ez18`tGc(38r z{U%T1een?c#~E-1^J)?<0!LsQ7f#hig`8@*BeaIIr@T%Je2u_D8B+cieISi7LN7y;|zQo>W#CW z4~?j*w$Jw}gjy&-z61ZNEjN@{#f4-QhZ9E{JJ#KIp+?*fnL`bJ z50o288@1qHbm9M?Mj<>rAJo#QWmM`Q6TxR8K2_X^U>{tYg{0cx_*6H`a^2kiRFs9p zjdus4fnJ)<5q^XW6eKUV<3HC>&ml$gcQVtnSsp=StyHtld9|8LX__fvEiuTsU4-cWkne_0!e|~vr#|` za17stD9^^fHM>Lm&my~5q&wiwE8SvLL|8p>Okw$5)JLe^9ag;;Rqqqki#mFTqx3eV z5UxZ+z0s&huQ<|ouHcP!Z%oPePy$u*z+{uS0pagGH&Pf-n8AMkA}_rE-jVqVIm7pl zbw{ER%G0fjJ0QDK1PR|7kdb`T-LEvg5a9*bkhFLw+uVj(cV=1;`ME_fIpx($NgtwjR*z>xIk@hIWZZsUQbiWr)izl7LBQob_7s~dd)zvT8 z9(>f+C^*Qt#sTTdQ<(mz*GIY8R5r$vy=ry5dF`gTrCB?&0XFZa_+ z#(LRa59A8vj7o;s$GX`5f*+``79HfNlx?6|;pN@O6+}>NhY}v5BFDdDa^DLvaL0l! z{v1808j@~k;!`)IjT6rUl1*u39Xe`U8c!3;YOE{CD4FN@>oY9DmF58AH1H2w@a~1u zqV3;bTlx=72}v0fs%>4oLanqA+h7eOc42l(=tyi&hLX&%doV(qZRs^)tKJA;#06LU z45NiiI+(a!g+GQAGE&fR#+e(&MhocxBrcFNHvAUW!Tl8U#yAIFfM=*?4>ggZ!GLPf zIYzT3$SqEIK7+ttYz4~ae^7!7fhs3j#eo#5^ z-MIMp(c|USN9)U##d}YemmWQdGxt{>RH%{#Wmc0cI`*NRE0rYwV8gp#rGE~@sioz` zFCWB%PbJQ5KE&g|#UyJDiIVeleWKiIPLQGRKGepX6; iseW;?Z)viAL1sZ}PG(*zkQ=56jWPgw;~d=p diff --git a/src/__pycache__/mp_genotype.cpython-37.pyc b/src/__pycache__/mp_genotype.cpython-37.pyc index 6c5ee751217c704539190016854f41b407c14cfe..4c1476be8b6ee63143888afa51edfafbc0d8efda 100644 GIT binary patch delta 74 zcmdn2uttH$iIReXD4suxx=RKt6x-Hl$>F#Uz}W&SdyHfpOl=WpOundY6ui@ b%u7x!E-9+i_s!4CFUU_%&D$);p3MdTb0HY_ delta 81 zcmZpY?v&8l$>FxpOl=WpPihMnirp5lwVq)pOund is$ZP!Tbit2kXewLlbM$ap|L%*oFC^^GWKPfp$KRY=iH7`EBD8IBoKPx4_ iRKGacw=`M5AhRGfCo?Y<$PLIZa8J!k-Q2@Al?MPKI~&>n diff --git a/src/__pycache__/somatic.cpython-37.pyc b/src/__pycache__/somatic.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45d75ddbc5fd79b2ba105345754cf0ac1b5dd8e8 GIT binary patch literal 12829 zcmcIq-H%(xb-!OEFE2lq%jJGbT2ZoWYh{VM@>d*3btPGm6UW}ll4_;5b!aZdH6;vwKd^R z=+u|lD%_6xoOFwbTdm2hDR&Ba36cD==9WZC*zl)CS~&1$L`G!c&x)ML!=Doqq5ywh zOo}4>1u-Q`@Q;aUF$4d&m=$yIPl$Q30Dn;&i`zLaP9T0#EQ*ux%c3k!!9OMRTiVj; z4~R#rOL~}jyCxg8c5uDX@(|U}!Le?1I;}c!kNy()EPSux^WT6IXiD4Db@Fxg^?{*u zq0b_HR@lS=drVf4-2Ib{s2eeIg|PPT6;Q<$?_-`sxbj;2gdVy*V_%Rz0ev0J{- zE#IrJmn9}k`+%IKM40HxMmq?Njvprd2Y%pfNun!Etb0Lya|!d*XroeSb!)+9Xj0Iu zqk<{D`gb*%N7+YLm!&W3o6jx#by-8L<+b|Sa#M76E+XYhyYBgce6ajxr`_pxHoSKA z-o?rb%c!yJceZLlqh9Gg2<`igVDr}M+x{$C^b2q_-8O8!sN1@w=k=^EDX~su(;xXJ zK4#hs0wbmovzSRj!|?UhB~vDmFH_{$a4-R8eM>Oyku!nRFe!K1ufBQhG{Qbj2m04# z5&meRh%wXv_jv|^2aeJPn52D7xupgAT}LkJAPtKTDWAu(1O>)wXs>pH>z$ppSTe%Y zh8I+W+D2%9&U^4_DLWF0mh@^Ow2@qG*S5UFq~&F2ZO0F)an-Zv+9yIZ-OwE{N6sNU zVh!b_@iCeAz^FiLGEIh>|2%4P>S;qdCt3yao0(I_%p2Cbod7Uh_7kwdD5lsr<1c5{?6fs|A{ro=ud z_VXB10b@dJY`jU%JBPeUtP1H{=9yk@}hdpKc;=iur^X)!e>$N4tXw ze@nYLocPje@Acc8UU|clTdhXhD|dE+?oLo%@3e#`%fH_**K6%|Cn&FZNWk zYQ9&-N<65w)e$=_uhRQGwI#icMyFj~hcNkB4ftMA4!G-fXRA?+XhPlO zMyrasbDfAc6h17iPTZ@t8lo0>)g2IC#aaxoOses^TgLc&5}ul|s4p6Mh%b`vHoh{1 znXS)2<`?zj{FBe3+(`b?(xi_C+a(0B8MT2<^0#jU5J!7@0JyQI-%ZPxpdyXH+B5u$ zGK3CcWCRI>j6H({7v!?J$_rkeKtoH(&}y{TJD~{?8(P7Gt`|BDztQ%C8uS;mVXFg{ zCnO7TQ|3@dJ_pB5QMIZF4l^N-hCWMKk?6jTvVInhW@PmOkJHiRClMcsZOR>q?f*lB zbPL2Q6hFp?Q3``b0pEmg!iNxr*fg5vz*5$}E)paj<9H;JF=7a110vECNr+4XA`=3* zpF*63FU8Y{+bSU(gh|*Uodm5yc^0ZvJ{?dC5Vl+cB32akjIW>TJIWcPR9Z|%u!Lg? zQA9b^8m8Vk5Q|d^;AFg=ekQP0W{)5Zl!?la_-CzIT79Iy5={`xCYq&QdfBgS1;9@J zP|%cyf+oNshCmY}aj0B`ES7uILrD;=-^ymavgV0ivV5=M-7oj-@`bf>`NA9JM|!zu zl`nM4y{Y9}b!?$v87+N!H;}b@Q02qYhq}{Cl^bp0?Us9qwVJxWhrK)Fc2Z_>Os z&9OXMg$H>AXUA#*@+KNk_`C0_#2~?t_jW%O0eY2cCI=}>0|=p=Vq;~cnrXxU7C7we z0YXP>x`~w@hq6?f*0{1&Ql&3x+wa}cg7Ka~6MA-n+=ur7*6(;K_5tmTe)h3c><3QG zJ(fBZr{+beKLKF%9qb++6TsCsu|reM{9r;%WA7J&f=Y=Qv^&Y6S%iulnnP%cL-Pog zIJAJ!G>48MG{d3e2+eZn1VVGaC}+fCe_o|k@^12>A*nX_G>>E0Jn{M@b2mzXWFj8- z=S6uiN!-Wh;LG4+KEuz8Q}dcCP;I!V0T+@C?rFI5s;DN_lq#udHKS(L+H{byNlLm`9Hzt~7#SZWmXDCu-H!hONmSST?8O&Dkv`S!J|p?UKLoHyTsa}|#D z=3BQAy!9FfSFVS~>Xk>j>b>~;{q!&X?}Wy+8*%}?en9-~jhb+JNp8(qnm&;FI72^x za1Jwk$_9#pcHl#_By}F9kk@T_IA1g0{m=wN3lof^LleWJvosFawN~J>hG1YJpGRE= z2=Ww#E>ly9daDCSMIw3WxV%E?SIK#T9MTuA34MW%>ICLuY=X&WxbivVhekuV=?KD6 z2`3JU?(^k95~eQ> zVGiRECNd6THr+H=r74XQF|VVc39PA#+L0m2JXKL=RnsH zx-79>VNPGsvqlN1lYfQ<>`8j?@!tx{I{F#pl%N;$x?{cw9SLp8U!*^6Q#R4Z2(6NW zAK|&nh+tR7CL96W1w0CExTk-85hk1&Wx{-uhxx`D==*fYP2AExbf7d>hp^}H`D)3C zRt_PNrKBYF9N~u~*n~rZyg(@xI4*#H1V$bQ2l6+OIl$-BX44EB8KH4bA4wdtx{zRC z{a-*p!P0Tpl3Ahm5oT*gbe5#n$1pgghu_4L1)Cf z@BLhy0qjT^0tNxb7eEZqnpR-pfPRq5r~|T{5d=^~3g>}?TtdBRVA^)y-d$HVX5$M$ z1ZhIx?CWHj00u}ZC)RsF5FCqj9%E-mWZ6(bINA|8#uXXTLn?ELQu5GnDGG1P@!1Vh z(5Y54-2_4b(#SqW8KLbIC_4|z&K#5jg(bo)1GFy+5&njTDvh^`+$`xTj5Y^TVg8}9 z>tcoF2qir<-g%=xp(db@iQ0+gM2v$97}Z3x0L>naaH2WMxQR-kpMFM|Do0}w+J>&O zWNs{J{YjN!e4xvlG+K;vQWz)$q@wa{lFZY{ff^?78qnEmy8(J}21Q13>-{M;C5jQI zh(q8jdMm{-@HGxaT%8Vp5x}bw{Au{ps>r<*K|Rt1&=o?+0o9YLcnKz}?f<-^0XJrZ zGbpLaeVnSG#VKCkP()n`)R|KIHBtwfV(LKWh&p3ieGKZRK^-{!Yov~NH}fIhncE9L zi#qTY<}dJ)R~uGXCKiTu~QYlVHj3aP*>Dk)KECk-j{x&gJhREi!^z8@)z1 zlC1Ai;?r=*M1mt)auXI?7*xJM>6gjz$f1s9og6_9!f@ItxG2cs9Q0;V$R}a}K(45mtK2 zD?*2skq+GmEopJ6fEsfM(QlD;WghO&PbUu3Vl4DPq^z86w0wrx;X`FQi`n-WLtp&7#0#+7%*civK=wDcMh*?UVr1u z`=eKL&%TMf5*QiEy}9K|g)c+ixCR&q>)V#s4k|axW%((Law&0uGY?;5q>o%e*HDXX z127=EHtiq=0Avpc$z%Wv^x=umj(7dPDCm>^9nL<@MSk=Vw2E@A~Pfo@Q zD!ZCecGIT#h9L@K@_oLcqRra`9HUaA$SqP2iBS&;y7Q&`Ub^e0dtTh};&zvAcIj4E zoEA@rGaE&fsHSkEl@d>qnPZSe*ly-%Ex{0)Z03Ur#344om#5~nLE#9RpNCg-KdTbWq5{+zOa&#hno~J(9HsIiJ(yNGN)Z-7TV9-H5Oqc@0fJ=! zzY6F%Pc;F=CI>V7S~J%#?!K;yfQeTDt|pN?ghE3slAsWhez*a*}sFes_|?R%97*z~3jQjq%DYn8fxuU9%qBoQ8`9$`t5UGxIO zj_G?{xkEjEwAPVrxB5{on9yu#L6W|Vq&srPNWkgq&k#TW(g^}RW z{2`?p8eQDJ(;a*j%}Ag6JVVYmsN>WIu6?>-Ok%wu0sRu)#=Q=V^kkjO;_(E@V!lX? z=rmy@lL1vs1|4b-mvNglTqs!$d;iVK_r9i6e3swL+fKY8^0;=b1yHiR$t`vfToVGqP3d!afsCv72mHsSzeUy3w*| zdT<@ss%^*z)TR-1BMi&%kyZ-XVz4Z#_q_TcWli^rTLdEU07vsl^Rdw-L7)#mgNzCV zy=-OyDhv3XGvw-V&~7LZy;~rSMi+24Br0h(7{lq}4MqsOF@PpBpk0!}5gBNq?E3~A^Am{S zDnjHb9wY-Y@Y{6Uhd!COqy;dQ3y1Uy^qqr4y@i(2IJi4OdXEm>RzEA!EdTRC7TS?6 z;C|4b*!?#(0U7^K&;SKCR*S!xAFdmPItRg~UO_UF`` za#R+k4;cT!vn-@I`FS?z&yw8S{=s7gi&S&!hpMpzQNao7EChpFgMJLS@4ZG zi3>#Vah~|7=8tS)?w|#5asl5le8=&fz_*C+B)%-ZGPqyDabX_F;#4p_$R0t@0$MwU z{E3I=?)TLxXm9hKf9anNX4GjxS1nJdC&UTXle_A4b2h%lUEF63L%GW4tCZJapt02x zn5#2w3)=2=D&2n)?p*&Vxbyv|;V$&gVjdh-23Jq=3RzMM%tso>IegFHJCE-IcySi< zRzZGw25;e*hMxw<#3{6}K=VhK4keex>HZ4LLEnavq)f9i2R*w7BP7g1U+Z5)Zx^H9 z`p?GUC$L(ci^FFSeqKGRp2qcyDNAaZdotzQ>SDi)7|roA<*Ul9c2|Fxkbj}haq2nH zu`*gw&k{xbmAF-)s{RY=1)-_jSAdNU`oc3XO75=f&Xo-~>|7?h_{XiF$@x ze@0CV7SY2+^nj~4gwF7qeJVDOCAPoC^kBD%80K1OX|hP;-6G!RYlk;SJH^Ts8``~FEbS{sh2^$CI9~8QQv-^ zYkiVyCFD0h9_{UaP@e#`GzSlfi&!Bq?CC>lIsRfC2emjgF}1{%4^f(s?yr?nTV3GU zxX?CkU`LTD6RXC$Yk~HERL3~)7$}HRzORmR%5m@^O8N09@5kyS=baqo{Y0JMloRNK zN*nTZwa6)pwC~W6%~n+5*pRBE{aJf?aZcmmIpY!!-d} zj-tB)GEu?Y)Cn5vRoqC)KcSpIg>y6zMRx?r+yn#GBN1!}6Yqz?E+f#;c=ak907AXW zz@l;a>V($I`}7=+9+VLfE6cAE4I@J2-x4)udc}cr(OD{wLAXDslHY_Armw*wc3tAF zMVRqBGQb59t~z}@+^Mzk1d8l;5km-{v7s+aUk4zoU8hSDf_$L~L>HOy=(sH3r8G-4 z>M;4?ZVgP8e@RKXE3L*xn`|{)l_4X&!VH~l+^s&~XPjXk@Ys`HyY5BtiGw)$lz7_} z74|*&D8&{Ya1Jl9d=Kx%HW>=CWpU|K2i6`Jw>or!z!R``m7aj$s;#a+6yRh;42*2CN;jX}S^0Mzb!MLwK7IDRtxKIDWs;5*#gXbZaNT zL$WAkl2h89D{R!HLSZV#vu=joAIb>R%DbR2)O|NQ&Iz|T9*fa0EG3|Up4_@=JX+fT z^ERcMiGn;u!*m*DY6}mfF>Nl6Jwwvns0kw5&5n~Ck7r13e7vCdcos1JT{lf^;n7R7 zwMOj5-g7g|g4l9PdytpD8_!JJ+XxfFYhkYQqy35(LZB*+qeK+9X;LWXFh)-jM&IZW zF!q0n4gk^9u~pV`VACzo9VkFKfoQVrT7c`3<(FRRkv(t*-&p``3fuZz@-+Je{WRNx zxlU9QuieT>Oa}(ok~x)bWNq|AM~i}98c(wv!$vw;t*M=1IByc&^4~G`rPDa@$_?GVr!=r;C6xQQb)#CSCXlY-?{U{bGCf)6UgWxz-K78 zNe*4s@@;~Ak3wG}hs6~gcz9T2b58cb$S`PMiXIcc0&}0{zXB(kz@yw~qO+Erv6FVn Vp0yo&#(o)Y!oF@7k(RV?{67R23)%nx literal 0 HcmV?d00001 diff --git a/src/__pycache__/utils.cpython-37.pyc b/src/__pycache__/utils.cpython-37.pyc index abcf3b2a9edee06a5d7d661a63250a3fe6f02305..39b2b76472b32a0ddba9bf6082a4961afb849389 100644 GIT binary patch delta 74 zcmcazbft*LiI1: + tp['RG']= [tp['RG'][0]] + tp['RG'][0]['SM'] = cell + tp['RG'][0]['ID'] = cell + cnt = 0 + outfile = pysam.AlignmentFile( out + "/Bam/split_bam/" + cell + ".bam", "wb", header=tp) + for s in infile: + t = robust_get_tag(s,"CB") + if not t=="NotFound": + if t==cell: + outfile.write(s) + cnt = cnt +1 + else: + if re.search(cell, s.query_name): + outfile.write(s) + cnt = cnt + 1 + + outfile.close() + infile.close() + cmd=samtools + " index " + out + "/Bam/split_bam/" + cell + ".bam" + + os.system(samtools + " index " + out + "/Bam/split_bam/" + cell + ".bam") + + return(cnt) + + + + +def jointCall(para): + + para_lst = para.strip().split(">") + jobid = para_lst[0] + chr=para_lst[1] + out = para_lst[2] + app_path = para_lst[3] + reference = para_lst[4] + samtools = app_path + "/samtools" + bcftools = app_path + "/bcftools" + bgzip = app_path + "/bgzip" + bam_filter = out + "/Bam/split_bam/cell.bam.lst" + cmd1 = samtools + " mpileup -b " + bam_filter + " -f " + reference + " -r " + jobid + " -q 20 -Q 20 -t DP4 -d 10000 -v " + cmd1 = cmd1 + " | " + bcftools + " view " + " | " + bcftools + " norm -m-both -f " + reference + cmd1 = cmd1 + " | grep -v \"\" | grep -v INDEL |" + bgzip + " -c > " + out + "/somatic/" + jobid + ".cell.gl.vcf.gz" + print(cmd1) + output = os.system(cmd1) + + # delete the bam files once snv calling was finished in specfic regions + f = open(bam_filter, "r") + for x in f: + x = x.strip() + #os.system("rm " + x) + #os.system("rm " + x + ".bai") + f.close() + + if output == 0: + return(jobid) diff --git a/src/haplotype_somatic.R b/src/haplotype_somatic.R new file mode 100644 index 0000000..33094ec --- /dev/null +++ b/src/haplotype_somatic.R @@ -0,0 +1,101 @@ + + +library(e1071) +library(reshape2) + +source("/rsrch3/scratch/bcb/jdou1/scAncestry/Monopogen/src/svm_phasing_sub.R") + +args = commandArgs(trailingOnly=TRUE) + +chr <- args[1] +loaddata <- args[2] +dis_limit_physical <- as.numeric(args[3]) +dis_limit_genetic <- as.numeric(args[4]) + + + + + +print(chr) + +if(loaddata==1){ + print("Loading data now...") + mat <- read.table(paste0(file="../germline/chr",chr,".gl.filter.hc.cell.mat.gz")) + #normal <- read.table(file=paste0("normal.",chr,".snv")) + #tumor <- read.table(file=paste0("tumor.",chr,".snv")) + #somatic <- read.table(file="somatic.snv") + feature_info <- read.csv(paste0(file="../debug/chr",chr,".gl.feature.info"),header=F) + + + mat$V2 <- paste0(mat$V1,":",mat$V2) + meta <- mat[,c("V1","V2","V3","V4")] + meta <- as.data.frame(meta) + meta$in_normal <- 0 + meta$in_tumor <- 0 + meta$somatic <- 0 + #meta$in_normal[meta$V2%in%normal$V1] <- 1 + #meta$in_tumor[meta$V2%in%tumor$V1] <- 1 + #meta$somatic[meta$V2%in%somatic$V1] <- 1 + meta$baf <- frq(mat) + + #meta$baf <-0.5 + # SVM module on remove low quality mutations + + + svm_info <- feature_info[feature_info$V1%in%meta$V2,] + svm_dt <- getFeatureInfo(svm_info=svm_info) + + overlap <- intersect(meta$V2, svm_dt$id) + + meta_qc <- meta[meta$V2%in%overlap,] + mat_qc <- mat[meta$V2%in%overlap,] + svm_dt_qc <- svm_dt[svm_dt$id%in%overlap,] + svm_dt_qc$BAF <- meta_qc$baf + save.image(file=paste0(chr,".input.image.RDS")) +} + +if(loaddata==2){ + load(file=paste0(chr,".input.image.RDS")) +} + + +source("/rsrch3/scratch/bcb/jdou1/scAncestry/Monopogen/src/svm_phasing_sub.R") +print(args) + + +print("start to run SVM and haplotype filtering") +print("new version ") + +# get putative mutation block +#meta_qc$flag <- paste0(meta_qc$in_normal, ":", meta_qc$in_tumor, ":", meta_qc$somatic) + + + + + +mutation_block <- SNV_block(summary=meta_qc) + +svm_in <- SVM_prepare(mutation_block) + +svm_out <- SVM_train(label =svm_in, data=svm_dt_qc, downsampling=2) + +p_lower <- sort(svm_out$test$POS)[floor(nrow(svm_out$test)*0.1)] +p_upper <- sort(svm_out$test$POS)[floor(nrow(svm_out$test)*0.95)] +filter1 <- (svm_out$test$POS>p_lower) +filter2 <- (svm_out$test$baf>0.1 ) +svm_pass <- svm_out$test[filter1 & filter2,] + + +svm_phasing <- Phasing(x=mat_qc, snv_meta=svm_pass, dis_limit=dis_limit_genetic, readlength=dis_limit_physical) + +svm_out$phasing <- svm_phasing +svm_out$feature <- svm_dt_qc + +saveRDS(svm_out,file=paste0("svm",chr,".out.RDS")) + + + +x=mat_qc +snv_meta=svm_pass +dis_limit=dis_limit_genetic +readlength=dis_limit_physical \ No newline at end of file diff --git a/src/index_motif.py b/src/index_motif.py new file mode 100644 index 0000000..e69de29 diff --git a/src/somatic.py b/src/somatic.py index ce40f21..8c49357 100644 --- a/src/somatic.py +++ b/src/somatic.py @@ -14,6 +14,7 @@ import numpy as np import gzip from pysam import VariantFile +from bamProcess import * import multiprocessing as mp from multiprocessing import Pool @@ -31,9 +32,9 @@ def withSNVs(invcf, path): return(cnt) def runCMD(cmd): - + print(cmd) os.system(cmd) - #process = subprocess.run(cmd, shell=True, stdout=open(args.logfile, 'w'), stderr=open(args.logfile,'w')) + #process = subprocess.run(cmd, shell=True) def robust_get_tag(read, tag_name): @@ -59,14 +60,10 @@ def validate_user_setting_somatic(args): exit(1) bam_filter = args.out + "/Bam/" + record[0] + ".filter.bam.lst" gl_vcf = args.out + "/germline/" + jobid + ".gl.vcf.gz" - gp_vcf = args.out + "/germline/" + jobid + ".germline.vcf.gz" phased_vcf = args.out + "/germline/" + jobid + ".phased.vcf.gz" assert os.path.isfile(bam_filter), "The bam list file {} cannot be found! Please run germline module".format(bam_filter) assert os.path.isfile(gl_vcf), "The *.gl.vcf.gz file {} cannot be found! Please run germline module".format(gl_vcf) - #if withSNVs(gl_vcf, args.app_path)==0: - # print("The *.gl.vcf.gz file {} cannot be found! Maybe there are no markers detected in ".format(gl_vcf)) - #if withSNVs(phased_vcf, args.app_path)==0: - # print("The *.phased.vcf.gz file {} cannot be found! Maybe there are no markers detected in this region?".format(phased_vcf)) + assert os.path.isfile(phased_vcf), "The *.phased.vcf.gz file {} cannot be found! Please run germline module".format(phased_vcf) assert os.path.isfile(args.barcode), "The cell barcode file {} cannot be found!".format(args.barcode) @@ -80,11 +77,36 @@ def getInfo_robust(rec, info): return(info_dt) -def featureInfo(para): +def bamExtract(para): + para_lst = para.strip().split(">") + chr = para_lst[0] out = para_lst[1] - region = para_lst[0] + app_path = para_lst[2] + samtools = os.path.abspath(app_path) + "/samtools" + out = os.path.abspath(out) + inbam = getBamName(chr, out) + outbam = out + "/Bam/" + chr + ".filter.targeted.bam" + # remember to update bed file ls -lrt + + #os.system("cat " + out + "/somatic/" + chr + "*.gl.vcf.filter.hc.bed > " + out + "/somatic/" + chr + ".bed") + chr_bed = out + "/somatic/" + chr + ".gl.vcf.filter.hc.bed" + cmd1 = samtools + " view " + " -b -L " + chr_bed + " " + inbam + " -o " + outbam + with open(out+"/Script/bamExtract_" + chr + ".sh","w") as f_out: + f_out.write(cmd1 + "\n") + f_out.write(samtools + " index " + outbam + "\n") + cmd="bash " + out+"/Script/bamExtract_" + chr + ".sh" + print(cmd) + os.system(cmd) +def featureInfo(para): + + para_lst = para.strip().split(">") + region = para_lst[0] + out = para_lst[1] + app = para_lst[2] + pysam.tabix_index(out + "/germline/" + region + ".phased.vcf.gz", preset="vcf", force=True) + pysam.tabix_index(out + "/germline/" + region + ".gl.vcf.gz", preset="vcf", force=True) vcf_in = VariantFile(out + "/germline/" + region + ".phased.vcf.gz") info_GT = {} for rec in vcf_in.fetch(): @@ -102,13 +124,13 @@ def featureInfo(para): for rec in gl_vcf_in.fetch(): info_I16 = rec.info.get('I16') info_QS = getInfo_robust(rec,'QS') - info_VDB = getInfo_robust(rec,'VDB') + info_VDB = getInfo_robust(rec,'VDB') info_RPB = getInfo_robust(rec,'RPB') info_MQB = getInfo_robust(rec,'MQB') info_BQB = getInfo_robust(rec,'BQB') - info_MQSB = getInfo_robust(rec,'MQSB') + info_MQSB =getInfo_robust(rec,'MQSB') info_SGB = getInfo_robust(rec,'SGB') - info_MQ0F = getInfo_robust(rec,'MQ0F') + info_MQ0F =getInfo_robust(rec,'MQ0F') id = str(rec.chrom)+":"+str(rec.pos) + ":" + rec.ref + ":" + rec.alts[0] gt_info_var = "NA" if (id in info_GT): @@ -126,6 +148,14 @@ def featureInfo(para): b = "{}\t{}\n".format(rec.chrom, rec.pos) gl_vcf_filter_txt.write(b) gl_vcf_filter_dp4.write(a) + gl_vcf_filter_dp4.close() + gl_vcf_filter_txt.close() + gl_vcf_filter_bed.close() + gl_vcf_dp4.close() + + + bamExtract(para) + def getBamName(chr, out): #chr = region.split(":")[0] @@ -135,97 +165,6 @@ def getBamName(chr, out): bamName = line.strip() return(bamName) -def bamExtract(para): - - para_lst = para.strip().split(">") - - chr = para_lst[0] - out = para_lst[1] - app_path = para_lst[2] - samtools = os.path.abspath(app_path) + "/samtools" - out = os.path.abspath(out) - inbam = getBamName(chr, out) - outbam = out + "/Bam/" + chr + ".filter.targeted.bam" - # remember to update bed file ls -lrt - - os.system("cat " + out + "/somatic/" + chr + "*.gl.vcf.filter.hc.bed > " + out + "/somatic/" + chr + ".bed") - cmd1 = samtools + " view " + " -b -L " + out + "/somatic/" + chr +".bed " + inbam + " -o " + outbam - with open(out+"/Script/bamExtract_" + chr + ".sh","w") as f_out: - f_out.write(cmd1 + "\n") - f_out.write(samtools + " index " + outbam + "\n") - cmd="bash " + out+"/Script/bamExtract_" + chr + ".sh" - runCMD(cmd) - - -def bamSplit(para): - para_lst = para.strip().split(":") - chr = para_lst[0] - cell = para_lst[1] - out = para_lst[2] - app_path = para_lst[3] - - #assert os.path.isfile(bam_filter), "*.fiter.targeted.bam file {} cannot be found!".format(bam_filter) - samtools = app_path + "/samtools" - output_bam = out + "/Bam/merge.filter.targeted.bam" - infile = pysam.AlignmentFile(output_bam,"rb") - # Note to change the read groups - tp =infile.header.to_dict() - if len(tp['RG'])>1: - tp['RG']= [tp['RG'][0]] - tp['RG'][0]['SM'] = cell - tp['RG'][0]['ID'] = cell - cnt = 0 - outfile = pysam.AlignmentFile( out + "/Bam/split_bam/" + cell + ".bam", "wb", header=tp) - for s in infile: - t = robust_get_tag(s,"CB") - if not t=="NotFound": - if t==cell: - outfile.write(s) - cnt = cnt +1 - else: - if re.search(cell, s.query_name): - outfile.write(s) - cnt = cnt + 1 - - outfile.close() - infile.close() - cmd=samtools + " index " + out + "/Bam/split_bam/" + cell + ".bam" - #runCMD(cmd,args) - os.system(samtools + " index " + out + "/Bam/split_bam/" + cell + ".bam") - - return(cnt) - - - - -def jointCall(para): - - para_lst = para.strip().split(">") - jobid = para_lst[0] - chr=para_lst[1] - out = para_lst[2] - app_path = para_lst[3] - reference = para_lst[4] - samtools = app_path + "/samtools" - bcftools = app_path + "/bcftools" - bgzip = app_path + "/bgzip" - bam_filter = out + "/Bam/split_bam/cell.bam.lst" - cmd1 = samtools + " mpileup -b " + bam_filter + " -f " + reference + " -r " + jobid + " -q 20 -Q 20 -t DP4 -d 10000 -v " - cmd1 = cmd1 + " | " + bcftools + " view " + " | " + bcftools + " norm -m-both -f " + reference - cmd1 = cmd1 + " | grep -v \"\" | grep -v INDEL |" + bgzip + " -c > " + out + "/somatic/" + jobid + ".cell.gl.vcf.gz" - print(cmd1) - output = os.system(cmd1) - - # delete the bam files once snv calling was finished in specfic regions - f = open(bam_filter, "r") - for x in f: - x = x.strip() - #os.system("rm " + x) - #os.system("rm " + x + ".bai") - f.close() - - if output == 0: - return(jobid) def less1(num): @@ -314,4 +253,283 @@ def LDrefinement(para): print(cmd) output = os.system(cmd) if output == 0: - return(region) \ No newline at end of file + return(region) + + + + + + +def robust_get_tag(read, tag_name): + try: + return read.get_tag(tag_name) + except KeyError: + return "NotFound" + +def rev_compl(st): + nn = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} + return "".join(nn[n] for n in reversed(st)) + + + +def bam2mat(para): + + para_lst = para.strip().split(">") + print(para_lst) + + # bam2gz(para); + + #### gz to matrix ### + mat_infile = para_lst[1] + "/somatic/" + para_lst[0] + ".cell_snv.mat.gz" + snv_infile = para_lst[1] + "/somatic/" + para_lst[0] + ".cell_snv.snvID.csv" + cell_infile = para_lst[1] + "/somatic/" + para_lst[0] + ".cell_snv.cellID.csv" + meta_info = para_lst[1] + "/somatic/" + para_lst[0] + ".gl.vcf.filter.DP4" + + cell_clst = pd.read_csv(cell_infile) + snv_clst = pd.read_csv(snv_infile) + mat = pd.read_csv(mat_infile, sep="\t", header=None) + mat_out = gzip.open(para_lst[1] + "/somatic/" + para_lst[0] + ".gl.filter.hc.cell.mat.gz","wt") + + + mat.columns = ['snvIndex','cellIndex','allele'] + mat=mat.groupby(by=['snvIndex','cellIndex'], as_index=False).first() + mat=mat.pivot(index='snvIndex', columns='cellIndex', values='allele') + meta_info = pd.read_csv(meta_info, sep="\t", header=None) + meta_info.columns=["chr","pos","ref_allele","alt_allele","Dep","dep1","dep2","dep3","dep4","genotype","QS","VDB","RPB","MQB","BQB","MQSB","SGB","MQ0F"] + + overlapSNV_index=set(snv_clst["index"]).intersection(set(mat.index)) + + cell_clst.loc[list(mat.columns)]['cell'].to_csv(para_lst[1] + "/somatic/" + para_lst[0] + ".cell_snv.cellID.filter.csv") + + ## meta_info_id is similar with snv_clst + n_cell = mat.shape[1] + for index in overlapSNV_index: + info = meta_info.iloc[index].astype(str) + geno = info["genotype"] + info = "\t".join(info) + + flag = 0 + if geno=="0|1": + phase_info = ["0|0"]*n_cell + tp = mat.loc[index] + pos = tp.dropna().index + for value in pos: + allele = tp.loc[value] + index_vec = tp.index.get_loc(value) + if allele==1: + phase_info[index_vec]="0|1" + elif allele==0: + phase_info[index_vec]="1|0" + flag = 1 + elif geno=="1|0": + phase_info = ["0|0"]*n_cell + tp = mat.loc[index] + pos = tp.dropna().index + for value in pos: + allele = tp.loc[value] + index_vec = tp.index.get_loc(value) + if allele==1: + phase_info[index_vec]="1|0" + elif allele==0: + phase_info[index_vec]="0|1" + flag = 1 + elif geno=="nan": + phase_info = ["0/0"]*n_cell + tp = mat.loc[index] + pos = tp.dropna().index + for value in pos: + allele = tp.loc[value] + index_vec = tp.index.get_loc(value) + if allele==1: + phase_info[index_vec]="0/1" + elif allele==0: + phase_info[index_vec]="1/0" + flag = 1 + if flag: + mat_out.write(info+"\t" +'\t'.join(phase_info)) + mat_out.write('\n') + + + + +def bam2gz(para): + + para_lst = para.strip().split(">") + + #assert os.path.isfile(bam_filter), "*.fiter.targeted.bam file {} cannot be found!".format(bam_filter) + #in_fasta = "/rsrch3/scratch/bcb/jdou1/scAncestry/ref/fasta/genome.fa" + #in_vcf = "/rsrch3/scratch/bcb/jdou1/bam_process/chr1.gp.vcf" + #in_bam = "/rsrch3/scratch/bcb/jdou1/bam_process/chr1.filter.targeted.bam" + + # joblst.append(id+">"+args.out+">"+args.reference) + + in_snv = para_lst[1] + "/somatic/" + para_lst[0] + ".gl.vcf.filter.DP4" + in_bam = para_lst[1] + "/Bam/" + para_lst[0] + ".filter.targeted.bam" + in_fasta = para_lst[2] + in_cell_barcode = para_lst[3] + + # read cell barcode file + + cell_clst = pd.read_csv(in_cell_barcode) + df = pd.DataFrame(cell_clst, columns= ['cell','id']) + df = df.sort_values(by=['id'], ascending=False) + df['index']=range(len(df.index)) + cell_set = list(df["cell"]) + + + ref_fa = pysam.FastaFile(in_fasta) + snv_info = {} + tp = list() + index = 0 + motif_len = 3 + snv_tol = 0 + + # index 0-based + with open(in_snv,'r') as fp: + for line in fp: + line = line.strip() + line = line.split("\t") + chrom = line[0] + pos = int(line[1]) + ref = line[2] + alts = line[3] + seq = ref_fa.fetch(chrom, pos-4, pos+3) + seq_rev_compl = rev_compl(seq) + id = str(chrom)+":"+str(pos) + ":" + ref + ":" + alts[0] + mydict = dict(id=index, chr = chrom, pos = pos, motif_pos= seq, ref_allele = ref, alt_allele=alts[0]) + snv_info[index]=mydict + index = index + 1 + snv_tol = snv_tol + 1 + tp.append(id) + + + + #print(snv_info_out) + ## output cell snv meta information + df.to_csv(para_lst[1] + "/somatic/" + para_lst[0] + ".cell_snv.cellID.csv", index = False) + snv_info_out = pd.DataFrame() + snv_info_out['snvID'] = tp + snv_info_out['index'] = range(len(tp)) + snv_info_out.to_csv(para_lst[1] + "/somatic/" + para_lst[0] + ".cell_snv.snvID.csv",index = False) + + read_tol = 0 + read_cover_tol = 0 + read_wild_tol = 0 + read_mutated_tol = 0 + read_noAllele_tol = 0 + overlap = 0 + index = 0 + lower_index = 0 + + infile = pysam.AlignmentFile(in_bam,"rb") + fp = gzip.open(para_lst[1] + "/somatic/" + para_lst[0] + ".cell_snv.mat.gz", "wt") + # fp = open(para_lst[1] + "/somatic/" + para_lst[0] + ".cell_snv.mat", "wt") + for s in infile: + + cell_barcode = robust_get_tag(s,"CB") + if cell_barcode=="NotFound": + ### no cell barcode file detected. + ### check whether the barcode added in the query name + cell_barcode = s.query_name.strip().split("_")[1] + + if cell_barcode in cell_set: + cell_barcode_index = cell_set.index(cell_barcode) + else: + ### cell barcode is not detected, should move to next read + continue + + read_name = s.query_name + align_chr = s.reference_name + align_start = s.reference_start + align_seq = s.query_sequence + mystart = str(snv_info[lower_index]["pos"]) + + + ### flags used to see whether SNVs covered in different scenarios + read_tol = read_tol + 1 + read_cover = 0 + read_wild = 0 + read_mutated = 0 + read_noAllele = 0 + read_len = len(align_seq) + + + ### get the SNVs locating in [align_start, align_start + read_len] ### + if read_tol%1000000 == 0: + print("scanning read " + str(read_tol)) + + lock = 0 + snv_cover = "" + + + for i in range(lower_index,snv_tol-1,1): + + snv_pos = snv_info[i]["pos"] + wild_allele = snv_info[i]["ref_allele"] + alt_allele = snv_info[i]["alt_allele"] + + if snv_pos >= align_start: + if lock==0: + lower_index = i + lock = lock + 1 + if snv_pos <= align_start + read_len: + read_cover = read_cover + 1 + snv_cover = str(snv_cover) + ":" + str(snv_pos) + motif_pos = snv_info[i]["motif_pos"] + motif_neg = motif_pos[:motif_len] + snv_info[i]["alt_allele"] + motif_pos[motif_len + 1:] + + if re.search(motif_pos, align_seq): + read_wild = read_wild + 1 + fp.write(str(i)+"\t"+str(cell_barcode_index)+"\t0\n") + elif re.search(motif_neg, align_seq): + read_mutated = read_mutated + 1 + fp.write(str(i)+"\t"+str(cell_barcode_index)+"\t1\n") + else: + delta = snv_pos - align_start + if read_len - delta < motif_len: + motif_pos_part = motif_pos[0:motif_len+1] + motif_neg_part = motif_neg[0:motif_len+1] + seq_part = align_seq[read_len-2*motif_len-1:read_len] + + if re.search(motif_pos_part, seq_part): + read_wild = read_wild + 1 + fp.write(str(i)+"\t"+str(cell_barcode_index)+"\t0\n") + elif re.search(motif_neg_part, seq_part): + read_mutated = read_mutated + 1 + fp.write(str(i)+"\t"+str(cell_barcode_index)+"\t1\n") + + elif delta <= motif_len: + motif_pos_part = motif_pos[motif_len:len(motif_pos)] + motif_neg_part = motif_neg[motif_len:len(motif_neg)] + seq_part = align_seq[0:2*motif_len+1] + + if re.search(motif_pos_part, seq_part): + read_wild = read_wild + 1 + fp.write(str(i)+"\t"+str(cell_barcode_index)+"\t0\n") + elif re.search(motif_neg_part, seq_part): + read_mutated = read_mutated + 1 + fp.write(str(i)+"\t"+str(cell_barcode_index)+"\t1\n") + else: + #fp.write(str(snv_info[i])+"\n") + #fp.write(str(align_chr) + ":" + str(align_start) + ":" + align_seq+"\n") + read_noAllele = read_noAllele + 1 + else: + break + + if read_cover > 0: + read_cover_tol = read_cover_tol + 1 + if read_wild > 0: + read_wild_tol = read_wild_tol + 1 + if read_mutated > 0 and read_wild>0: + overlap = overlap + 1 + + if read_mutated > 0: + read_mutated_tol = read_mutated_tol + 1 + if read_noAllele > 0: + read_noAllele_tol = read_noAllele_tol + 1 + + + infile.close() + fp.close() + print(str(read_tol) + ":" + str(read_cover_tol) + ":" + str(read_wild_tol) + ":" + str(read_mutated_tol) + ":" + str(overlap) + ":" + str(read_noAllele_tol)) + return(para_lst[0]) \ No newline at end of file diff --git a/src/svm_phasing_sub.R b/src/svm_phasing_sub.R new file mode 100644 index 0000000..317d520 --- /dev/null +++ b/src/svm_phasing_sub.R @@ -0,0 +1,268 @@ + +frq <- function(x=mat) { + res<-matrix(0,nrow(mat),1) + for(i in seq(1,nrow(mat),1)){ + geno <- mat[i, seq(5,ncol(mat),1)] + geno <- gsub("/","|", geno) + geno <- geno[!geno=="0|0"] + s <- strsplit(geno, "|") + result <- sapply(s, "[[", 1) + if(length(result)>10){ + + myfrq <- length( which(result=="0"))/length(result) + }else{ + myfrq <- 0 + } + if(myfrq>0.5){myfrq <- 1- myfrq} + res[i,1] <- myfrq + } + return(res) +} + + +getFeatureInfo <-function(svm_info=NULL){ + n_min <-(-1000) + dt <- data.frame("id"=svm_info$V1, "DP"=n_min, "QS"=n_min, + "VDB"=n_min,"SGB"=n_min,"RPB"=n_min, "MQB"=n_min,"MQSB"=n_min, "BQB"=n_min) + + svm_info[is.na(svm_info)] <-"-1000" + for(i in seq(1,nrow(svm_info),1)){ + for(j in seq(1,ncol(svm_info),1)){ + if(svm_info[i,j]=="DP"){dt$DP[i] <- svm_info[i,j+1]} + if(svm_info[i,j]=="QS"){dt$QS[i] <- svm_info[i,j+1]} + if(svm_info[i,j]=="VDB"){dt$VDB[i] <- svm_info[i,j+1]} + if(svm_info[i,j]=="SGB"){dt$SGB[i] <- svm_info[i,j+1]} + if(svm_info[i,j]=="RPB"){dt$RPB[i] <- svm_info[i,j+1]} + if(svm_info[i,j]=="MQB"){dt$MQB[i] <- svm_info[i,j+1]} + if(svm_info[i,j]=="MQSB"){dt$MQSB[i] <- svm_info[i,j+1]} + if(svm_info[i,j]=="BQB"){dt$BQB[i] <- svm_info[i,j+1]} + } + } + return(dt) +} + +SNV_block <-function(summary=NULL){ + block <- 1 + last <- ".|." + summary$region <- 0 + for(i in seq(1,nrow(summary),1)){ + if(summary[i,4]==".|."){ + if(last==".|." | last=="0|0"){;} + else{ + block <- block + 1 + } + summary$region[i] <- block + } + last <- summary[i,4] + } + return(summary) +} + + +SVM_prepare <-function(x=NULL){ + svm<-list() + a<- table(x$region) + neg <- x[(x$region%in% names(a)[a>=4] & x$V4==".|.") | (x$V4=="0|0" & x$baf>0),] + #neg <- x[(x$region%in% names(a)[a>=3] & x$V4==".|.") | x$V4=="0|0",] + #neg <- x[x$V4=="0|0" | ,] + pos <- x[!x$V4==".|." & !x$V4=="0|0" &x$baf>0.4,] + svm$neg <- neg + svm$pos <- pos + svm$test <- x[x$region%in% names(a)[a<4] & x$V4==".|.",] + return(svm) +} + +SVM_train <- function(label=NULL, data=NULL, downsampling=20){ + + + print("Run SVM trainning") + + feature <-c("QS", "VDB", "SGB", "RPB", "MQB", "MQSB", "BQB", "BAF") + data$BQB <- as.numeric(data$BQB) + + data$VDB[data$VDB==(-1000)] <-0 + data$MQSB[data$MQSB==(-1000)] <-1 + data$QS <- abs(data$QS-0.5) + x_pos <- data[data$id%in%label$pos$V2,feature] + #x_pos <- x_pos[seq(1,nrow(x_pos),downsampling),] + x_neg <- data[data$id%in%label$neg$V2,feature] + + y_pos <- rep("POS", nrow(x_pos)) + y_neg <- rep("NEG", nrow(x_neg)) + + if(nrow(x_pos) > nrow(x_neg)){ + # randomly select matched variants between pos and neg + sel <- sample(nrow(x_pos))[seq(1,nrow(x_neg),1)] + x_pos <- x_pos[sel,] + y_pos <- y_pos[sel] + } + if(nrow(x_pos) < nrow(x_neg)){ + # randomly select matched variants between pos and neg + sel <- sample(nrow(x_neg))[seq(1,nrow(x_pos),1)] + x_neg <- x_neg[sel,] + y_neg <- y_neg[sel] + } + + print(paste0("pos->:", length(y_pos))) + print(paste0("neg->:", length(y_neg))) + + model <- svm(as.matrix(rbind(x_pos, x_neg)), as.factor(c(y_pos,y_neg)), probability=TRUE) + test <- data[data$id%in%label$test$V2,feature] + pred <- predict(model, as.matrix(test), probability=TRUE) + prob <- attr(pred, "probabilities") + prob <-as.data.frame(prob) + label$test$POS <- prob$POS + label$test$NEG <- prob$NEG + return(label) +} + +calcP <- function(geno_vec, dis_vec, dis_limit=20000){ + + dis_bin <- c(100, 1000, 5000, 10000, 20000, 500000, 1000000000000000000) + prob_bin <- c(1, 0.93, 0.78, 0.71, 0.69, 0.64, 0) + + prob_bin[dis_bin>dis_limit] <- 0 + + pos_match <- which(geno_vec%in%c("000","111")) + dis_match <- dis_vec[pos_match] + + match_p <- 0 + for(pos in dis_match){ + if(pos<=dis_bin[1]){p <-1} + else{ + p <- prob_bin[max(which(pos>=dis_bin))+1] + } + match_p <- match_p + p/sqrt(pos) + } + + pos_mismatch <- which(geno_vec%in%c("010","101")) + dis_mismatch <- dis_vec[pos_mismatch] + + + mismatch_p <- 0 + for(pos in dis_mismatch){ + if(pos<=dis_bin[1]){p <-1} + else{ + p <- prob_bin[max(which(pos>=dis_bin))+1] + } + mismatch_p <-mismatch_p + p/sqrt(pos) + } + + if((match_p + mismatch_p)==0){p <- (-1)} + else{ + p <- match_p/(match_p+mismatch_p) + } + + return(p) +} + +Phasing <- function(x = NULL, snv_meta = NULL, dis_limit = 20000, readlength=NULL){ + + start <- 1 + if(start){ + y <- x + dis <- colsplit(y$V1,":", c("id1","id2"))[,2] + dis_rd_study <-c() + trio_hap_rd_study <-c() + hap_rd2 <-c() + dis_rd2 <-c() + cnt <- 0 + + check_study <- snv_meta + check_study$prob <- (-1000) + check_study$tol <- 0 + check_study$dis <- (-1000) + + check_study$prob_phy <- (-1000) + check_study$tol_phy <- 0 + res<-c() + + pos_lst_study <-list() + for(i in seq(5,ncol(y),1)){ + vec <- y[,i] + pos_lst_study[[i]] <- which(!vec=="0|0" & !y$V4==".|." & !vec=="0/0") + } + } + + for(snv in snv_meta$V2){ + cnt=cnt+1 + #print(paste0("scanning ",cnt," now...")) + snv_index <- which(y$V2==snv) + snv_index_noMisCol <- which(!y[snv_index,]=="0|0" & !y[snv_index,]=="0/0",) + + # remove the first column since they are not genotypes + snv_index_noMisCol <- snv_index_noMisCol[snv_index_noMisCol>4] + geno_vec <-c() + dis_vec <-c() + dis_phy1 <- c() + dis_phy2 <- c() + # identify haplotypes for each element + index <-0 + for(pos in snv_index_noMisCol){ + # element index (snv_index, pos) + pos_noMisCol <- pos_lst_study[[pos]] + lower_vec <- which(pos_noMisColsnv_index) + if(length(lower_vec)>=1 & length(upper_vec)>=1){ + lower <- pos_noMisCol[max(lower_vec)] + upper <- pos_noMisCol[min(upper_vec)] + tp <- y[snv_index, pos] + tp <- gsub("/", "|", tp) + geno <- c(y[lower,pos], tp, y[upper, pos]) + + part <-colsplit(geno,"\\|",c("i1","i2"))[,2] + part[part>1] <- 1 + flag <- paste0(part[1],part[2],part[3]) + hap_rd2 <-c(hap_rd2, abs(part[2]-part[1]), abs(part[3]-part[2])) + dis_rd2 <-c(dis_rd2, dis[snv_index]-dis[lower], dis[upper]-dis[snv_index]) + + dis_vec <- c(dis_vec, dis[upper]-dis[lower]) + dis_phy1 <- c(dis_phy1, dis[snv_index]-dis[lower]) + dis_phy2 <- c(dis_phy2, dis[upper]-dis[snv_index]) + geno_vec <-c(geno_vec,flag) + } + } + + sum <- 0 + sum_equal <- 0 + if(sum(dis_phy10){ + p <- sum_equal/sum + check_study$prob_phy[cnt] <- p + check_study$tol_phy[cnt] <- sum + print(paste0("Scan ", cnt,":", snv, " ", sum, " ", p)) + } + + if(length(geno_vec)>0 ){ + dis_rd_study <-c(dis_rd_study, dis_vec) + trio_hap_rd_study <-c(trio_hap_rd_study, geno_vec) + #print(table(trio_hap_rd)) + check_study$prob[cnt] <- calcP(geno_vec,dis_vec,dis_limit) + check_study$tol[cnt] <- sum(dis_vec