Add files via upload

KChen-lab · Feb 26, 2024 · 3e2aa6e · 3e2aa6e
1 parent cecf991
commit 3e2aa6e
Show file tree

Hide file tree

Showing 21 changed files with 1,091 additions and 245 deletions.
diff --git a/src/AlleleDropOut_germline.R b/src/AlleleDropOut_germline.R
@@ -0,0 +1,187 @@
+library(reshape2)
+calcP <- function(geno_vec, dis_vec, dis_limit=20000){
+
+		dis_bin <-  c(100,  1000,  5000,   10000, 20000,  500000, 1000000000000000000)
+		prob_bin <- c(1,    0.93,  0.78,   0.71,  0.69,   0.64,   0)
+
+		prob_bin[dis_bin>dis_limit] <- 0
+
+		pos_match <- which(geno_vec%in%c("000","111"))
+		dis_match <- dis_vec[pos_match]
+
+		match_p <- 0
+		for(pos in dis_match){
+				if(pos<=dis_bin[1]){p <-1}
+				else{
+					p <- prob_bin[max(which(pos>=dis_bin))+1]
+				}
+				match_p <- match_p + p/sqrt(pos) 
+		}
+
+		pos_mismatch <- which(geno_vec%in%c("010","101"))
+		dis_mismatch <- dis_vec[pos_mismatch]
+
+
+		mismatch_p <- 0
+		for(pos in dis_mismatch){
+				if(pos<=dis_bin[1]){p <-1}
+				else{
+					p <- prob_bin[max(which(pos>=dis_bin))+1]
+				}
+				mismatch_p <-mismatch_p + p/sqrt(pos) 
+		}
+
+		if(is.na(match_p) | is.na(mismatch_p)){p<-(-1)}
+		else{
+			if((match_p + mismatch_p) ==0){
+				p <- (-1)
+			}else{
+				p <- match_p/(match_p+mismatch_p)
+			}
+		}
+
+		return(p)
+}
+
+
+
+
+
+args = commandArgs(trailingOnly=TRUE)
+
+chr <- args[1]
+out <- args[2]
+len <- as.numeric(args[4])
+bin <- as.numeric(args[5])
+
+load(paste0(args[3],chr,".input.image.RDS"))
+
+
+
+germline <- mat_qc[!mat_qc$V4=="0|0" & !mat_qc$V4==".|.",]
+
+print(dim(germline))
+
+y <- germline
+dis <- colsplit(y$V1,":", c("id1","id2"))[,2]
+check <- germline[,c(1,2,3,4)]
+check$prob <- (-1000)
+check$tol <- 0
+check$dis <- (-1000)
+
+check$prob_phy <- (-1000)
+check$tol_phy <- 0
+
+pos_lst <-list()
+sta <- 0
+for(i in seq(5,ncol(y),1)){
+ 	vec <- y[,i]
+ 	pos_lst[[i]] <- which(!vec=="0|0" & !y$V4==".|.")
+ 	sta <- sta + length(which(!vec=="0|0" & !y$V4==".|."))
+}
+
+
+dis_rd <-c()
+trio_hap_rd <-c()
+cnt <- 0
+dis_limit <- 20000
+
+lst <- germline$V2
+dis_rd2 <-c()
+hap_rd2 <-c()
+
+for(snv in lst[seq(1,length(lst),bin)]){
+ 	  cnt=cnt+1
+ 	  #print(paste0("scanning ",cnt," now..."))
+ 	  snv_index <- which(y$V2==snv)
+ 	  snv_index_noMisCol <- which(!y[snv_index,]=="0|0" & !y[snv_index,]=="0/0",)
+
+ 	  # remove the first column since they are not genotypes 
+ 	  snv_index_noMisCol <- snv_index_noMisCol[snv_index_noMisCol>4]
+
+ 	  geno_vec <-c()
+ 	  dis_vec <-c()
+ 	  dis_phy1 <- c()
+	  dis_phy2 <- c()
+ 	  # identify haplotypes for each element
+ 	  index <-0
+ 	  for(pos in snv_index_noMisCol){
+ 	  	 # element index (snv_index, pos)
+ 	  	 pos_noMisCol <- pos_lst[[pos]]
+ 	  	 lower_vec <- which(pos_noMisCol<snv_index)
+ 	  	 upper_vec <- which(pos_noMisCol>snv_index)
+
+ 	  	 if(length(lower_vec)>=1 & length(upper_vec)>=1){
+ 	  	 		lower <- pos_noMisCol[max(lower_vec)]
+ 	  	 		upper <- pos_noMisCol[min(upper_vec)]
+ 	  	 		tp <- y[snv_index, pos]
+ 	  	 		tp <- gsub("/", "|", tp) 
+ 	  	 		geno <- c(y[lower,pos], tp, y[upper, pos])
+
+ 	  	 		part <-colsplit(geno,"\\|",c("i1","i2"))[,2]
+				part[part>1] <- 1
+				flag <- paste0(part[1],part[2],part[3])
+				hap_rd2 <-c(hap_rd2, abs(part[2]-part[1]), abs(part[3]-part[2]))
+				dis_rd2 <-c(dis_rd2, dis[snv_index]-dis[lower], dis[upper]-dis[snv_index])
+
+ 	  	 		dis_vec <- c(dis_vec, dis[upper]-dis[lower])
+ 	  	 		dis_phy1 <- c(dis_phy1,  dis[snv_index]-dis[lower])
+ 	  	 		dis_phy2 <- c(dis_phy2, dis[upper]-dis[snv_index])
+ 	  	 		geno_vec <-c(geno_vec,flag)
+ 	  	 }
+ 	  }
+
+ 	  sum <- 0
+ 	  sum_equal <- 0
+
+ 	  if(!is.null(dis_phy1) & sum(dis_phy1<len)){
+ 	  	 d_geno <- substr(geno_vec[which(dis_phy1<len)],1,2)
+ 	  	 sum <- length(d_geno)
+ 	  	 sum_equal <- sum(d_geno=="00" | d_geno=="11")
+ 	  }
+
+ 	  if(!is.null(dis_phy2) & sum(dis_phy2<len)){
+ 	  	 d_geno <- substr(geno_vec[which(dis_phy2<len)],2,3)
+ 	  	 sum <- length(d_geno) + sum
+ 	  	 sum_equal <- sum(d_geno=="00" | d_geno=="11") + sum_equal
+ 	  }
+
+
+ 	  if(sum>0){
+ 	  	  		p <- sum_equal/sum
+ 	  	  		check$prob_phy[cnt] <- p
+ 	  	  		check$tol_phy[cnt] <- sum
+ 	  	  		print(paste0("Scan ", cnt,":", snv, " ", sum, " ", p, " ",len))
+ 	  }
+
+ 	  if(length(geno_vec)>0 ){
+ 	  	  dis_rd <-c(dis_rd, dis_vec)
+		  trio_hap_rd <-c(trio_hap_rd, geno_vec)
+ 	  	  #print(table(trio_hap_rd))
+ 	  	  sel <-list()
+ 	  	  sel <- table(geno_vec[dis_vec<1000000])
+
+ 	  	  if(length(sel)>0){
+ 	  	  		check$prob[cnt] <- 0
+ 	  	  		check$tol[cnt] <- sum(dis_vec<dis_limit)
+ 	  	  		check$dis[cnt] <- median(dis_vec[dis_vec<dis_limit])
+ 	  	  		#print(paste0("Scan ", cnt,":", snv, " ", sum, " ", (p_sum_000 + p_sum_111)/sum))
+
+ 	  	  }
+ 	  }
+}
+
+
+
+res <-list()
+res$dis <- dis_rd
+res$hap <- trio_hap_rd
+res$check <- check
+res$dis2 <- dis_rd2
+res$hap2 <- hap_rd2
+saveRDS(res, file=out)
+
+
+
+
+
diff --git a/src/LDrefinement.R b/src/LDrefinement.R
@@ -31,8 +31,7 @@ SNV_block <-function(summary=NULL){
 SVM_prepare <-function(x=NULL){
 	svm<-list()
 	region_cnt <- table(x$region)
-	### SNV hot-spot regions (including multi-alleles) ###
-	filter <- names(region_cnt)[region_cnt>=2]
+	filter <- names(region_cnt)[region_cnt>=4]
 	neg <- x[(x$region%in% filter & x$genotype==".|."),]
 	pos <- x[!x$genotype==".|.",]
 	svm$neg <- neg 
@@ -49,7 +48,6 @@ SVM_train <- function(label=NULL, dir=NULL, region=NULL){
 	label$pos[label$pos=="None"] <- NA
 	# using median values to replace the missing values 
 	train_x_pos <- impute(as.matrix(data.matrix(label$pos[,features])), what="median")
-	train_x_pos[is.na(train_x_pos)] <- 0
 
 	# using the minior value of QS 
 	vec <- train_x_pos[,colnames(train_x_pos)=="QS"]
@@ -59,15 +57,13 @@ SVM_train <- function(label=NULL, dir=NULL, region=NULL){
 	label$neg <- as.data.frame(label$neg)
 	label$neg[label$neg=="None"] <- NA
 	train_x_neg <- impute(as.matrix(data.matrix(label$neg[,features])), what="median")
-	train_x_neg[is.na(train_x_neg)] <- 0
 	vec <- train_x_neg[,colnames(train_x_neg)=="QS"]
 	vec[vec>0.5] <- 1- vec[vec>0.5]
 	train_x_neg[,1] <- vec
 
 	label$test <- as.data.frame(label$test)
 	label$test[label$test=="None"] <- NA
 	test_x <- impute(as.matrix(data.matrix(label$test[,features])), what="median")
-	test_x[is.na(test_x)] <- 0
 	vec <- test_x[,colnames(test_x)=="QS"]
 	vec[vec>0.5] <- 1- vec[vec>0.5]
 	test_x[,1] <- vec
@@ -83,10 +79,12 @@ SVM_train <- function(label=NULL, dir=NULL, region=NULL){
 	  print(p)
 	}
 	dev.off()
-
+
+
 	model <- svm(as.matrix(rbind(train_x_pos, train_x_neg)), 
 				 as.factor(c(train_y_pos,train_y_neg)), 
 				 probability=TRUE)
+  print("Done")
 	pred <- predict(model, as.matrix(test_x), probability=TRUE)
 	prob <- attr(pred, "probabilities")
 	prob <-as.data.frame(prob)
@@ -410,10 +408,11 @@ somaticLD <- function(mat=NULL, svm=NULL,  dir=NULL, region=NULL, min_size=50){
 
 dt <- fread(mat_gz)
 dt <- data.frame(dt)
-dt$V10[is.na(dt$V10)] <- ".|."
+dt$V10[dt$V10=="nan"] <- ".|."
 rownames(dt) <- paste0(dt$V1,":",dt$V2,":",dt$V3,":",dt$V4)
 
-cellName <- read.table(file=paste0(outdir,region,".cell.txt"),header=F)
+cellName <- read.csv(file=paste0(outdir,region,".cell_snv.cellID.filter.csv"),header=T)
+cellName <- cellName[,2]
 
 for(j in seq(11,18,1)){
   dt[,j] <- as.numeric(dt[,j])
@@ -422,7 +421,6 @@ meta <- dt[,1:18]
 colnames(meta) <-c("chr","pos","ref","alt","Dep","dep1","dep2","dep3","dep4",
 	"genotype","QS","VDB","RPB","MQB","BQB","MQSB","SGB","MQ0F")
 
-
 mutation_block <- SNV_block(summary=meta)
 svm_in <- SVM_prepare(mutation_block)
 svm_out <- SVM_train(label =svm_in,dir=outdir, region=region)
@@ -431,7 +429,25 @@ final <- somaticLD(mat=dt, svm=svm_out, dir=outdir, region=region)
 
 
 mut_mat <- dt[rownames(final$out),]
-colnames(mut_mat) <-c(colnames(meta),cellName[,1])
+colnames(mut_mat) <-c(colnames(meta),cellName)
+
+
+#### re-assign the sequencing depth for each allele
+
+for(i in seq(1, nrow(mut_mat),1)){
+  N_wild <- 0 
+  N_mut <- 0 
+  vec <- mut_mat[i,seq(19, ncol(mut_mat),1)]
+  sta <- unname(unlist(vec))
+  sta <- table(c(sta,"0/1","1/0","1/1"))
+  sta <- sta - 1 
+  N_alt <- sta["0/1"] + sta["1/1"]
+  N_ref <- sta["1/0"] + sta["1/1"]
+  LDrefine_somatic[i,6] <- N_ref 
+  LDrefine_somatic[i,7] <- N_alt
+  LDrefine_somatic[i,8] <- N_ref + N_alt
+  LDrefine_somatic[i,12] <- N_alt/(N_ref+N_alt)
+}
 
 write.csv(final$LDrefine_somatic,   paste0(outdir,region,".putativeSNVs.csv"),quote=FALSE,row.names = FALSE)
 write.csv(final$LDrefine_germline2, paste0(outdir,region,".germlineTwoLoci_model.csv"),quote=FALSE, row.names = FALSE)