1.源码
function (x, y, wt = NULL, intercept = TRUE, tolerance = 1e-07, yname = NULL) { x <- as.matrix(x) y <- as.matrix(y) xnames <- colnames(x)#x的列名 if (is.null(xnames)) { if (ncol(x) == 1L) #赋予列名 xnames <- "X" else xnames <- paste0("X", 1L:ncol(x)) } if (intercept) {#如果有截距的话,那么就多赋值一个列。 x <- cbind(1, x)#按列合并,那么行数一样,即每行为一个样本 xnames <- c("Intercept", xnames) } if (is.null(yname) && ncol(y) > 1) yname <- paste0("Y", 1L:ncol(y)) good <- complete.cases(x, y, wt)#去除NA值。这里暗示了x y行数是一样的? dimy <- dim(as.matrix(y)) if (any(!good)) {#如果至少有一个是false warning(sprintf(ngettext(sum(!good), "%d missing value deleted", "%d missing values deleted"), sum(!good)), domain = NA) x <- as.matrix(x)[good, , drop = FALSE] y <- as.matrix(y)[good, , drop = FALSE] wt <- wt[good] } nrx <- NROW(x) ncx <- NCOL(x) nry <- NROW(y) ncy <- NCOL(y) nwts <- length(wt) if (nry != nrx) #如果x的样本数与y的样本数不相等 stop(sprintf(paste0(ngettext(nrx, "'X' matrix has %d case (row)", "'X' matrix has %d cases (rows)"), ", ", ngettext(nry, "'Y' has %d case (row)", "'Y' has %d cases (rows)")), nrx, nry), domain = NA) if (nry < ncx) stop(sprintf(paste0(ngettext(nry, "only %d case", "only %d cases"), ", ", ngettext(ncx, "but %d variable", "but %d variables")), nry, ncx), domain = NA) if (!is.null(wt)) {#用于加权最小二乘 if (any(wt < 0)) stop("negative weights not allowed") if (nwts != nry) stop(gettextf("number of weights = %d should equal %d (number of responses)", nwts, nry), domain = NA) wtmult <- sqrt(wt) if (any(wt == 0)) { xzero <- as.matrix(x)[wt == 0, ] yzero <- as.matrix(y)[wt == 0, ] } x <- x * wtmult y <- y * wtmult invmult <- 1/ifelse(wt == 0, 1, wtmult) } z <- .Call(C_Cdqrls, x, y, tolerance, FALSE)#调用C中的函数 resids <- array(NA, dim = dimy) dim(z$residuals) <- c(nry, ncy) if (!is.null(wt)) { if (any(wt == 0)) { if (ncx == 1L) fitted.zeros <- xzero * z$coefficients else fitted.zeros <- xzero %*% z$coefficients z$residuals[wt == 0, ] <- yzero - fitted.zeros } z$residuals <- z$residuals * invmult } resids[good, ] <- z$residuals#所有不含NA的行的残差被赋值为计算所得的残差 if (dimy[2L] == 1 && is.null(yname)) {#如果y只有一列 resids <- drop(resids)#此时会变成一个向量 names(z$coefficients) <- xnames#系数被赋值为x的列名, #因为一列正好对应的是一个变量,行对应的是样本数 } else { colnames(resids) <- yname colnames(z$effects) <- yname dim(z$coefficients) <- c(ncx, ncy)#是5行,100列,正好就是系数矩阵H dimnames(z$coefficients) <- list(xnames, yname)#确实是对应, #行是细胞类型的名字,列是基因名字。 } z$qr <- as.matrix(z$qr) colnames(z$qr) <- xnames output <- list(coefficients = z$coefficients, residuals = resids) #输出中有系数和残差 if (z$rank != ncx) { xnames <- xnames[z$pivot] dimnames(z$qr) <- list(NULL, xnames) warning("'X' matrix was collinear") } if (!is.null(wt)) { weights <- rep.int(NA, dimy[1L]) weights[good] <- wt output <- c(output, list(wt = weights)) } rqr <- list(qt = drop(z$effects), qr = z$qr, qraux = z$qraux, rank = z$rank, pivot = z$pivot, tol = z$tol) oldClass(rqr) <- "qr" output <- c(output, list(intercept = intercept, qr = rqr)) #这里intercept只是一个布尔值??? return(output) }
//感觉很坑啊,计算的部分是使用了
z <- .Call(C_Cdqrls, x, y, tolerance, FALSE)#调用C中的函数
> stats:::C_Cdqrls $`name` [1] "Cdqrls" $address <pointer: 0x00000000025f96b0> attr(,"class") [1] "RegisteredNativeSymbol" $dll DLL name: stats Filename: D:/RRsetup/R-3.5.1/library/stats/libs/x64/stats.dll Dynamic lookup: FALSE $numParameters [1] 4 attr(,"class") [1] "CallRoutine" "NativeSymbolInfo"
这个就进行了最小二乘,具体的过程还是看不到....
//那似乎这个lsfit函数除了调用C函数计算之外,似乎也没什么了,就是去掉NA值,并且对结果进行整理,赋值列名行名之类的。
2.其中一些函数学习
2.1 paste0函数
> xy<-paste0("X",1:5) > xy [1] "X1" "X2" "X3" "X4" "X5"
就是对其进行粘贴,形成一个字符串向量。
2.2 complete.cases函数——去除空值
转自:http://blog.sina.com.cn/s/blog_59990a450101qnvy.html
Value返回值为 A logical vector specifying which observations/rows have no missing values across the entire sequence. 一个逻辑向量,指明哪一行有缺失值NA
下面用实例来说明这两个函数的作用:
这是一个数据框final:
gene hsap mmul mmus rnor cfam
1 ENSG00000208234 0 NA NA NA NA
2 ENSG00000199674 0 2 2 2 2
3 ENSG00000221622 0 NA NA NA NA
4 ENSG00000207604 0 NA NA 1 2
5 ENSG00000207431 0 NA NA NA NA
6 ENSG00000221312 0 1 2 3 2
如果要去除有NA的行,则可用:
final[complete.cases(final),]
也可用 na.omit(final)
//对行进行检测,没有NA的行返回的是true,那么自然被保存。
上述运行结果如下:
gene hsap mmul mmus rnor cfam
2 ENSG00000199674 0 2 2 2 2
6 ENSG00000221312 0 1 2 3 2
如果想过滤部分列:
final[complete.cases(final[,5:6]),]
即只对5、6列进行判断。
gene hsap mmul mmus rnor cfam
2 ENSG00000199674 0 2 2 2 2
4 ENSG00000207604 0 NA NA 1 2
6 ENSG00000221312 0 1 2 3 2
这样第四行含有空值,但是,我们的命令是只过滤掉第5列,第6列中含有NA的行。
2.3 any函数
The value returned is TRUE if at least one of the values in x is TRUE, and FALSE if all of the values in x are FALSE
//即如果有一个值为真,那么则返回真;如果全部为假,那么则返回假。
> range(x <- sort(round(stats::rnorm(10) - 1.2, 1))) [1] -2.6 0.5 > if(any(x < 0)) cat("x contains negative values ") x contains negative values
#
2.4 range函数
取向量中最大值与最小值。
> (r.x <- range(stats::rnorm(100))) [1] -1.805257 3.410545 #产生100个服从正态分布的数,使用range函数取最值。
2.5 rnorm函数
rnorm(n, mean = 0, sd = 1) 产生n个符合生态分布的数,缺省的均值是0,标准差是1
2.6 ngettext函数
> for(n in 0:3) + print(sprintf(ngettext(n, "%d variable has missing values", + "%d variables have missing values"), + n)) [1] "0 variables have missing values" [1] "1 variable has missing values" [1] "2 variables have missing values" [1] "3 variables have missing values"
//我认为是和C语言中的printf是比较像的,就是将其中的数进行匹配即可。
2.7 stop函数
> iter <- 12 > try(if(iter > 10) stop("too many iterations")) Error in try(if (iter > 10) stop("too many iterations")) : too many iterations #能够终止函数的执行
2.8 drop函数
对于多维数据,去掉长度为1的维度。
> dim(drop(array(1:12, dim = c(1,3,1,1,2,1,2)))) [1] 3 2 2
上例中,即去掉第一维、第三四维、第六维这四个维度,那么还剩下的是这些。
> x<-drop(array(1:12, dim = c(1,3,1,1,2,1,2))) > x , , 1 [,1] [,2] [1,] 1 4 [2,] 2 5 [3,] 3 6 , , 2 [,1] [,2] [1,] 7 10 [2,] 8 11 [3,] 9 12
dim是给数组赋予维数:
> z<-c(1:6) > dim(z)<-c(2,3) > z [,1] [,2] [,3] [1,] 1 3 5 [2,] 2 4 6
> z<-c(1:6) > dim(z)<-c(1,6) > z [,1] [,2] [,3] [,4] [,5] [,6] [1,] 1 2 3 4 5 6 > drop(z) [1] 1 2 3 4 5 6 > z<-c(1:6) > dim(z)<-c(6,1) > z [,1] [1,] 1 [2,] 2 [3,] 3 [4,] 4 [5,] 5 [6,] 6 > drop(z) [1] 1 2 3 4 5 6 #试验如果维度有一个为1的话,那么会是什么情况
那么似乎是去掉那个为1的维度,然后其他正常,此时
> z<-drop(z) > dim(z) NULL