mskcc · natir · Jun 18, 2021 · Jun 18, 2021
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -8,6 +8,6 @@ Maintainer: Venkatraman E. Seshan <seshanv@mskcc.org>
 Description: Algorithm to implement Fraction and Allelic Copy number
              Estimate from Tumor/normal Sequencing.
 License: GPL (>= 2)
-Depends: R (>= 3.0.0), pctGCdata (>= 0.3.0)
+Depends: R (>= 3.0.0), pctGCdata (>= 0.3.0), tidyverse (>= 1.2.1)
 Remotes: veseshan/pctGCdata
 NeedsCompilation: yes
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,5 +1,6 @@
 useDynLib(facets)
 import(stats,graphics)
+import("tidyverse")
 importFrom("utils", "read.csv")
 importFrom("grDevices", "colorRampPalette", "rainbow")
 importFrom("pctGCdata", "getGCpct")

diff --git a/R/facets-findDipLogR.R b/R/facets-findDipLogR.R
@@ -93,7 +93,7 @@ findDiploidLogR <- function(out, cnlr) {
         if (diff(dipLogR) < log2(1.25)) 
             flags <- c(flags, "possibly subclonal 2+2 states present")
     # cat("dipLogR =", dipLogR, "\n")
-    # print(out0)
+
     # first remove the balanced segs
     out1 <- out0[-bsegs,]
     # is dipLogR[1] the 1+1 level; if so all lower cnlr.med values are losses
@@ -121,7 +121,6 @@ findDiploidLogR <- function(out, cnlr) {
         out2 <- t(sapply(1:nrow(out1), function(i, dlr, out1) {
                              acnsplit(dlr, out1$cnlr.median[i], out1$mafR[i])
                          }, dipLogR[1], out1))
-        # print(out2)
         out1$acn <- apply(out2[,2*(1:3), drop=FALSE], 1, which.min)
         # proportion of genome that fits 1+0, 2+0 & 3+0
         acn1prop <- sum(out1$num.mark[out1$acn == 1])/nsnps
@@ -207,7 +206,7 @@ findDiploidLogR <- function(out, cnlr) {
             }
         }
         #plot(ocnlevels0, dev1, pch=16, col=colr)
-        #print(dev1)
+
         cn2logR <- ocnlevels0[which.min(dev1)]
         # if wgd.likely is non-null add dipLogR[1]
         if (!is.null(wgd.likely)) {

diff --git a/R/facets-procreads.R b/R/facets-procreads.R
@@ -2,33 +2,33 @@
 procSnps <- function(rcmat, ndepth=35, het.thresh=0.25, snp.nbhd=250, nX=23, unmatched=FALSE, ndepthmax=1000) {
     # keep only chromsomes 1-22 & X for humans and 1-19, X for mice
     # for other genomes (gbuild = udef) nX is number of autosomes plus 1
-    chromlevels <- c(1:(nX-1),"X")
-    chr.keep <- rcmat$Chromosome %in% chromlevels
-    # keep only snps with normal read depth between ndepth and 1000
-    depthN.keep <- (rcmat$NOR.DP >= ndepth) & (rcmat$NOR.DP < ndepthmax)
-    # reduce the data frame to these snps
-    rcmat <- rcmat[chr.keep & depthN.keep,]
-    # output data frame
-    out <- list()
-    out$chrom <- rcmat$Chromosome
-    out$maploc <- rcmat$Position
-    out$rCountT <- rcmat$TUM.DP
-    out$rCountN <- rcmat$NOR.DP
-    out$vafT <- 1 - rcmat$TUM.RD/rcmat$TUM.DP
-    out$vafN <- 1 - rcmat$NOR.RD/rcmat$NOR.DP
-    # make chromosome ordered and numeric
-    out$chrom <- as.numeric(ordered(out$chrom, levels=chromlevels))
+    rcmat <- rcmat %>%
+        tibble::add_column(vafT = 1 - rcmat$TUM.RD/rcmat$TUM.DP) %>%
+        tibble::add_column(vafN = 1 - rcmat$NOR.RD/rcmat$NOR.DP) %>%
+        dplyr::filter(Chromosome %in% c(1:(nX-1), "X")) %>%
+        dplyr::filter((NOR.DP >= ndepth) & (NOR.DP < ndepthmax)) %>%
+        dplyr::filter(TUM.DP>0) %>%
+        dplyr::rename(chrom = Chromosome) %>%
+        dplyr::rename(maploc = Position) %>%
+        dplyr::rename(rCountT = TUM.DP) %>%
+        dplyr::rename(rCountN = NOR.DP) %>%
+
+        dplyr::arrange(chrom)
+
+
     # call a snp heterozygous if min(vafN, 1-mafN) > het.thresh
     if (unmatched) {
         if (het.thresh == 0.25) het.thresh <- 0.1
-        out$het <- 1*(pmin(out$vafT, 1-out$vafT) > het.thresh & out$rCountT >= 50)
+        rcmat$het <- 1*(pmin(rcmat$vafT, 1-rcmat$vafT) > het.thresh & rcmat$rCountT >= 50)
     } else {
-        out$het <- 1*(pmin(out$vafN, 1-out$vafN) > het.thresh)
+        rcmat$het <- 1*(pmin(rcmat$vafN, 1-rcmat$vafN) > het.thresh)
     }
     # scan maploc for snps that are close to one another (within snp.nbhd bases)
     # heep all the hets (should change if too close) and only one from a nbhd
-    out$keep <- scanSnp(out$maploc, out$het, snp.nbhd)
-    as.data.frame(out)
+    rcmat$keep <- scanSnp(rcmat$maploc, rcmat$het, snp.nbhd)
+    rcmat <- rcmat %>%
+        dplyr::filter(keep == 1)
+    as.data.frame(rcmat)
 }
 
 scanSnp <- function(maploc, het, nbhd) {
@@ -43,33 +43,33 @@ scanSnp <- function(maploc, het, nbhd) {
 }
 
 # obtain logR and logOR from read counts and GC-correct logR
-counts2logROR <- function(mat, gbuild, unmatched=FALSE, ugcpct=NULL, f=0.2) {
-    out <- mat[mat$keep==1,]
+counts2logROR <- function(mat, gbuild, unmatched=FALSE, ugcpct=NULL, f=0.2, nX=23) {
     # gc percentage
-    out$gcpct <- rep(NA_real_, nrow(out))
+    mat$gcpct <- rep(NA_real_, nrow(mat))
     # get GC percentages from pctGCdata package
     # loop thru chromosomes
-    nchr <- max(mat$chrom) # IMPACT doesn't have X so only 22
+
+    nchr <- nX - 1 # IMPACT doesn't have X so only 22
     for (i in 1:nchr) {
-        ii <- which(out$chrom==i)
+        ii <- which(mat$chrom==i)
         # allow for chromosomes with no SNPs i.e. not targeted
         if (length(ii) > 0) {
             if (gbuild == "udef") {
-                out$gcpct[ii] <- getGCpct(i, out$maploc[ii], gbuild, ugcpct)
+                mat$gcpct[ii] <- getGCpct(i, mat$maploc[ii], gbuild, ugcpct)
             } else {
-                out$gcpct[ii] <- getGCpct(i, out$maploc[ii], gbuild)
+                mat$gcpct[ii] <- getGCpct(i, mat$maploc[ii], gbuild)
             }
         }
     }
     ##### log-ratio with gc correction and maf log-odds ratio steps
-    chrom <- out$chrom
-    maploc <- out$maploc
-    rCountN <- out$rCountN
-    rCountT <- out$rCountT
-    vafT <- out$vafT
-    vafN <- out$vafN
-    het <- out$het
-    gcpct <- out$gcpct
+    chrom <- mat$chrom
+    maploc <- mat$maploc
+    rCountN <- mat$rCountN
+    rCountT <- mat$rCountT
+    vafT <- mat$vafT
+    vafN <- mat$vafN
+    het <- mat$het
+    gcpct <- mat$gcpct
     # compute gc bias
     ncount <- tapply(rCountN, gcpct, sum)
     tcount <- tapply(rCountT, gcpct, sum)
@@ -98,10 +98,10 @@ counts2logROR <- function(mat, gbuild, unmatched=FALSE, ugcpct=NULL, f=0.2) {
         lorvar[het==1] <- (1/(rcmat[,1]+0.5) + 1/(rcmat[,2]+0.5) + 1/(rcmat[,3]+0.5) + 1/(rcmat[,4]+0.5))
     }
     # put them together
-    out$lorvar <- out$valor <- out$cnlr <- out$gcbias <- rep(NA_real_, nrow(out))
-    out$gcbias <- gcbias
-    out$cnlr <- cnlr
-    out$valor <- valor
-    out$lorvar <- lorvar
-    out
+    mat$lorvar <- mat$valor <- mat$cnlr <- mat$gcbias <- rep(NA_real_, nrow(mat))
+    mat$gcbias <- gcbias
+    mat$cnlr <- cnlr
+    mat$valor <- valor
+    mat$lorvar <- lorvar
+    mat
 }
diff --git a/R/facets-segment.R b/R/facets-segment.R
@@ -117,7 +117,7 @@ prune.cpt.tree <- function(seg.tree, cval=25) {
 }
 
 # segment by looping over the chromosomes
-segsnps <- function(mat, cval=25, hetscale=FALSE, delta=0) {
+segsnps <- function(mat, cval=25, hetscale=FALSE, delta=0, nX=23) {
     # keep the original data
     mat0 <- mat
     # keep only rows that have finite values for cnlr
@@ -137,7 +137,7 @@ segsnps <- function(mat, cval=25, hetscale=FALSE, delta=0) {
     # initialize segment indicator
     mat$seg <- rep(NA_real_, nrow(mat))
     # loop over chromosomes
-    nchr <- max(mat$chrom) # IMPACT doesn't have X so only 22
+    nchr <- nX - 1 # IMPACT doesn't have X so only 22
     # possible chromosomes
     chrs <- 1:nchr
     # initialize segmentation tree
@@ -169,15 +169,19 @@ segsnps <- function(mat, cval=25, hetscale=FALSE, delta=0) {
 jointsegsummary <- function(jointseg) {
     # remove snps with NA in segs (due to NA in cnlr)
     jointseg <- jointseg[is.finite(jointseg$seg),]
+
     # initialize output table
     nsegs <- max(jointseg$seg)
+
     # segment start and end indices and number of loci
-    seg.start <- which(diff(c(0,jointseg$seg))==1)
+    seg.start <- which(diff(c(0, jointseg$seg)) == 1)
     seg.end <- c(seg.start[-1]-1, nrow(jointseg))
     num.mark <- seg.end - seg.start + 1
+
     # initialize the output
     out <- as.data.frame(matrix(0, nsegs, 6))
     names(out) <- c("chrom","seg","num.mark","nhet","cnlr.median","mafR")
+
     # function to estimate maf from valor and lorvar
     maffun <- function(x) {
         # occasional extreme valor can screw maf. so winsorize the maf
@@ -191,8 +195,11 @@ jointsegsummary <- function(jointseg) {
         valor[valor < valor.thresh] <- valor.thresh
         sum(((valor)^2 - lorvar)/lorvar)/sum(1/lorvar)
     }
+
     # loop over the segments
     for (seg in 1:nsegs) {
+        if (is.na(seg.start[seg]) && is.na(seg.end[seg]))
+            break
         zhet <- jointseg$het[seg.start[seg]:seg.end[seg]]
         out[seg, 1] <- jointseg$chrom[seg.start[seg]]
         out[seg, 2] <- seg
@@ -202,6 +209,7 @@ jointsegsummary <- function(jointseg) {
         if (out[seg, 4] > 0) {
             zvalor <- jointseg$valor[seg.start[seg]:seg.end[seg]]
             zlorvar <- jointseg$lorvar[seg.start[seg]:seg.end[seg]]
+
             zz1 <- data.frame(valor=zvalor[zhet == 1], lorvar=zlorvar[zhet == 1])
             out[seg, 6] <- maffun(zz1)
         }