• R中矩阵运算


    # 数据产生
    # rnorm(n, mean = 0, sd = 1) 正态分布的随机数(r 代表随机,可以替换成dnorm, pnorm, qnorm 作不同计算。r= random = 随机, d= density = 密度, p= probability = 概率 , q =quantile = 分位)
    # runif(n, min = 0, max = 1) 平均分布的随机数
    # rep(1,5) 把1重复5次
    # scale(1:5) 标准化数据
    > a <- c(rnorm(5), rnorm(5,1), runif(5), runif(5,-1,1), 1:5, rep(0,5), c(2,10,11,13,4), scale(1:5)[1:5])
    > a
     [1] -0.41253556  0.12192929 -0.47635888 -0.97171653  1.09162243  1.87789657
     [7] -0.11717937  2.92953522  1.33836620 -0.03269026  0.87540920  0.13005744
    [13]  0.11900686  0.76663940  0.28407356 -0.91251181  0.17997973  0.50452258
    [19]  0.25961316 -0.58052230  1.00000000  2.00000000  3.00000000  4.00000000
    [25]  5.00000000  0.00000000  0.00000000  0.00000000  0.00000000  0.00000000
    [31]  2.00000000 10.00000000 11.00000000 13.00000000  4.00000000 -1.26491106
    [37] -0.63245553  0.00000000  0.63245553  1.26491106
    > a <- matrix(a, ncol=5, byrow=T)
    > a
               [,1]       [,2]       [,3]       [,4]        [,5]
    [1,] -0.4125356  0.1219293 -0.4763589 -0.9717165  1.09162243
    [2,]  1.8778966 -0.1171794  2.9295352  1.3383662 -0.03269026
    [3,]  0.8754092  0.1300574  0.1190069  0.7666394  0.28407356
    [4,] -0.9125118  0.1799797  0.5045226  0.2596132 -0.58052230
    [5,]  1.0000000  2.0000000  3.0000000  4.0000000  5.00000000
    [6,]  0.0000000  0.0000000  0.0000000  0.0000000  0.00000000
    [7,]  2.0000000 10.0000000 11.0000000 13.0000000  4.00000000
    [8,] -1.2649111 -0.6324555  0.0000000  0.6324555  1.26491106
     
    # 求行的加和
    > rowSums(a)
    [1] -0.6470593  5.9959284  2.1751865 -0.5489186 15.0000000  0.0000000 40.0000000
    [8]  0.0000000
    
    # 去除全部为0的行
    > a <- a[rowSums(abs(a))!=0,]
    > a
               [,1]       [,2]       [,3]       [,4]        [,5]
    [1,] -0.4125356  0.1219293 -0.4763589 -0.9717165  1.09162243
    [2,]  1.8778966 -0.1171794  2.9295352  1.3383662 -0.03269026
    [3,]  0.8754092  0.1300574  0.1190069  0.7666394  0.28407356
    [4,] -0.9125118  0.1799797  0.5045226  0.2596132 -0.58052230
    [5,]  1.0000000  2.0000000  3.0000000  4.0000000  5.00000000
    [6,]  2.0000000 10.0000000 11.0000000 13.0000000  4.00000000
    [7,] -1.2649111 -0.6324555  0.0000000  0.6324555  1.26491106
     
    # 矩阵运算,R默认针对整个数据进行常见运算
    # 所有值都乘以2
    > a * 2
               [,1]       [,2]       [,3]       [,4]        [,5]
    [1,] -0.8250711  0.2438586 -0.9527178 -1.9434331  2.18324487
    [2,]  3.7557931 -0.2343587  5.8590704  2.6767324 -0.06538051
    [3,]  1.7508184  0.2601149  0.2380137  1.5332788  0.56814712
    [4,] -1.8250236  0.3599595  1.0090452  0.5192263 -1.16104460
    [5,]  2.0000000  4.0000000  6.0000000  8.0000000 10.00000000
    [6,]  4.0000000 20.0000000 22.0000000 26.0000000  8.00000000
    [7,] -2.5298221 -1.2649111  0.0000000  1.2649111  2.52982213
     
    # 所有值取绝对值,再取对数 (取对数前一般加一个数避免对0或负值取对数)
    > log2(abs(a)+1)
              [,1]      [,2]      [,3]      [,4]      [,5]
    [1,] 0.4982872 0.1659818 0.5620435 0.9794522 1.0646224
    [2,] 1.5250147 0.1598608 1.9743587 1.2255009 0.0464076
    [3,] 0.9072054 0.1763961 0.1622189 0.8210076 0.3607278
    [4,] 0.9354687 0.2387621 0.5893058 0.3329807 0.6604014
    [5,] 1.0000000 1.5849625 2.0000000 2.3219281 2.5849625
    [6,] 1.5849625 3.4594316 3.5849625 3.8073549 2.3219281
    [7,] 1.1794544 0.7070437 0.0000000 0.7070437 1.1794544
     
    # 取出最大值、最小值、行数、列数
    > max(a)
    [1] 13
    > min(a)
    [1] -1.264911
    > nrow(a)
    [1] 7
    > ncol(a)
    [1] 5
     
    # 增加一列或一行
    # cbind: column bind
    > cbind(a, 1:7)
               [,1]       [,2]       [,3]       [,4]        [,5] [,6]
    [1,] -0.4125356  0.1219293 -0.4763589 -0.9717165  1.09162243    1
    [2,]  1.8778966 -0.1171794  2.9295352  1.3383662 -0.03269026    2
    [3,]  0.8754092  0.1300574  0.1190069  0.7666394  0.28407356    3
    [4,] -0.9125118  0.1799797  0.5045226  0.2596132 -0.58052230    4
    [5,]  1.0000000  2.0000000  3.0000000  4.0000000  5.00000000    5
    [6,]  2.0000000 10.0000000 11.0000000 13.0000000  4.00000000    6
    [7,] -1.2649111 -0.6324555  0.0000000  0.6324555  1.26491106    7
    > cbind(a, seven=1:7)
                                                                 seven
    [1,] -0.4125356  0.1219293 -0.4763589 -0.9717165  1.09162243     1
    [2,]  1.8778966 -0.1171794  2.9295352  1.3383662 -0.03269026     2
    [3,]  0.8754092  0.1300574  0.1190069  0.7666394  0.28407356     3
    [4,] -0.9125118  0.1799797  0.5045226  0.2596132 -0.58052230     4
    [5,]  1.0000000  2.0000000  3.0000000  4.0000000  5.00000000     5
    [6,]  2.0000000 10.0000000 11.0000000 13.0000000  4.00000000     6
    [7,] -1.2649111 -0.6324555  0.0000000  0.6324555  1.26491106     7
     
    # rbind: row bind
    > rbind(a,1:5)
               [,1]       [,2]       [,3]       [,4]        [,5]
    [1,] -0.4125356  0.1219293 -0.4763589 -0.9717165  1.09162243
    [2,]  1.8778966 -0.1171794  2.9295352  1.3383662 -0.03269026
    [3,]  0.8754092  0.1300574  0.1190069  0.7666394  0.28407356
    [4,] -0.9125118  0.1799797  0.5045226  0.2596132 -0.58052230
    [5,]  1.0000000  2.0000000  3.0000000  4.0000000  5.00000000
    [6,]  2.0000000 10.0000000 11.0000000 13.0000000  4.00000000
    [7,] -1.2649111 -0.6324555  0.0000000  0.6324555  1.26491106
    [8,]  1.0000000  2.0000000  3.0000000  4.0000000  5.00000000
     
    # 计算每一行的mad (中值绝对偏差,一般认为比方差的鲁棒性更强,更少受异常值的影响,更能反映数据间的差异)1表示矩阵行,2表示矩阵列,也可以是c(1,2)
    > apply(a,1,mad)
    [1] 0.7923976 2.0327283 0.2447279 0.4811672 1.4826000 4.4478000 0.9376786
     
    # 计算每一行的var (方差)
    # apply表示对数据(第一个参数)的每一行 (第二个参数赋值为1) 或每一列 (2)操作。最后返回一个列表
    > apply(a,1,var)
    [1]  0.6160264  1.6811161  0.1298913  0.3659391  2.5000000 22.5000000  1.0000000
     
    # 计算每一列的平均值
    > apply(a,2,mean)
    [1] 0.4519068 1.6689045 2.4395294 2.7179083 1.5753421
     
    # 取出中值绝对偏差大于0.5的行
    > b = a[apply(a,1,mad)>0.5,]
    > b
               [,1]       [,2]       [,3]       [,4]        [,5]
    [1,] -0.4125356  0.1219293 -0.4763589 -0.9717165  1.09162243
    [2,]  1.8778966 -0.1171794  2.9295352  1.3383662 -0.03269026
    [3,]  1.0000000  2.0000000  3.0000000  4.0000000  5.00000000
    [4,]  2.0000000 10.0000000 11.0000000 13.0000000  4.00000000
    [5,] -1.2649111 -0.6324555  0.0000000  0.6324555  1.26491106
     
    # 矩阵按照mad的大小降序排列
    > c = b[order(apply(b,1,mad), decreasing=T),]
    > c
               [,1]       [,2]       [,3]       [,4]        [,5]
    [1,]  2.0000000 10.0000000 11.0000000 13.0000000  4.00000000
    [2,]  1.8778966 -0.1171794  2.9295352  1.3383662 -0.03269026
    [3,]  1.0000000  2.0000000  3.0000000  4.0000000  5.00000000
    [4,] -1.2649111 -0.6324555  0.0000000  0.6324555  1.26491106
    [5,] -0.4125356  0.1219293 -0.4763589 -0.9717165  1.09162243
     
    > rownames(c) <- paste('Gene', letters[1:5], sep="_")
    > colnames(c) <- toupper(letters[1:5])
    > c
                    A          B          C          D           E
    Gene_a  2.0000000 10.0000000 11.0000000 13.0000000  4.00000000
    Gene_b  1.8778966 -0.1171794  2.9295352  1.3383662 -0.03269026
    Gene_c  1.0000000  2.0000000  3.0000000  4.0000000  5.00000000
    Gene_d -1.2649111 -0.6324555  0.0000000  0.6324555  1.26491106
    Gene_e -0.4125356  0.1219293 -0.4763589 -0.9717165  1.09162243
     
    # 矩阵转置
    > expr = t(c)
    > expr
      Gene_a      Gene_b Gene_c     Gene_d     Gene_e
    A      2  1.87789657      1 -1.2649111 -0.4125356
    B     10 -0.11717937      2 -0.6324555  0.1219293
    C     11  2.92953522      3  0.0000000 -0.4763589
    D     13  1.33836620      4  0.6324555 -0.9717165
    E      4 -0.03269026      5  1.2649111  1.0916224
     
    # 矩阵值的替换
    > expr2 = expr
    > expr2[expr2<0] = 0
    > expr2
      Gene_a   Gene_b Gene_c    Gene_d    Gene_e
    A      2 1.877897      1 0.0000000 0.0000000
    B     10 0.000000      2 0.0000000 0.1219293
    C     11 2.929535      3 0.0000000 0.0000000
    D     13 1.338366      4 0.6324555 0.0000000
    E      4 0.000000      5 1.2649111 1.0916224
     
    # 矩阵中只针对某一列替换
    # expr2是个矩阵不是数据框,不能使用列名字索引
    > expr2[expr2$Gene_b<1, "Gene_b"] <- 1
    Error in expr2$Gene_b : $ operator is invalid for atomic vectors
    # str是一个最为常用、好用的查看变量信息的工具,尤其是对特别复杂的变量,可以看清其层级结构,便于提取数据
    > str(expr2)
     num [1:5, 1:5] 2 10 11 13 4 ...
     - attr(*, "dimnames")=List of 2
      ..$ : chr [1:5] "A" "B" "C" "D" ...
      ..$ : chr [1:5] "Gene_a" "Gene_b" "Gene_c" "Gene_d" ...
     
    # 转换为数据框,再进行相应的操作
    > expr2 <- as.data.frame(expr2)
    > str(expr2)
    'data.frame':    5 obs. of  5 variables:
     $ Gene_a: num  2 10 11 13 4
     $ Gene_b: num  1.88 1 2.93 1.34 1
     $ Gene_c: num  1 2 3 4 5
     $ Gene_d: num  0 0 0 0.632 1.265
     $ Gene_e: num  0 0.122 0 0 1.092
    > expr2[expr2$Gene_b<1, "Gene_b"] <- 1
    > expr2
      Gene_a   Gene_b Gene_c    Gene_d    Gene_e
    A      2 1.877897      1 0.0000000 0.0000000
    B     10 1.000000      2 0.0000000 0.1219293
    C     11 2.929535      3 0.0000000 0.0000000
    D     13 1.338366      4 0.6324555 0.0000000
    E      4 1.000000      5 1.2649111 1.0916224
    R中矩阵筛选合并
    # 读入样品信息
    > sampleInfo = "Samp;Group;Genotype
    + A;Control;WT
    + B;Control;WT
    + D;Treatment;Mutant
    + C;Treatment;Mutant
    + E;Treatment;WT
    + F;Treatment;WT"
    > phenoData = read.table(text=sampleInfo, sep=";", header=T, row.names=1, quote="")
    > phenoData
          Group Genotype
    A   Control       WT
    B   Control       WT
    D Treatment   Mutant
    C Treatment   Mutant
    E Treatment       WT
    F Treatment       WT
     
    # 把样品信息按照基因表达矩阵中的样品信息排序,并只保留有基因表达信息的样品
    > phenoData[match(rownames(expr), rownames(phenoData)),]
          Group Genotype
    A   Control       WT
    B   Control       WT
    C Treatment   Mutant
    D Treatment   Mutant
    E Treatment       WT
     
    # 注意顺序,%in%比match更好理解一些
    > phenoData = phenoData[rownames(phenoData) %in% rownames(expr),]
    > phenoData
          Group Genotype
    A   Control       WT
    B   Control       WT
    C Treatment   Mutant
    D Treatment   Mutant
    E Treatment       WT
     
    # 合并矩阵
    # by=0 表示按照行的名字排序
    # by=columnname 表示按照共有的某一列排序
    # 合并后多出了新的一列Row.names
    > merge_data = merge(expr, phenoData, by=0, all.x=T)
    > merge_data
      Row.names Gene_a      Gene_b Gene_c     Gene_d     Gene_e     Group Genotype
    1         A      2  1.87789657      1 -1.2649111 -0.4125356   Control       WT
    2         B     10 -0.11717937      2 -0.6324555  0.1219293   Control       WT
    3         C     11  2.92953522      3  0.0000000 -0.4763589 Treatment   Mutant
    4         D     13  1.33836620      4  0.6324555 -0.9717165 Treatment   Mutant
    5         E      4 -0.03269026      5  1.2649111  1.0916224 Treatment       WT
     
    > rownames(merge_data) <- merge_data$Row.names
    > merge_data 
      Row.names Gene_a      Gene_b Gene_c     Gene_d     Gene_e     Group Genotype
    A         A      2  1.87789657      1 -1.2649111 -0.4125356   Control       WT
    B         B     10 -0.11717937      2 -0.6324555  0.1219293   Control       WT
    C         C     11  2.92953522      3  0.0000000 -0.4763589 Treatment   Mutant
    D         D     13  1.33836620      4  0.6324555 -0.9717165 Treatment   Mutant
    E         E      4 -0.03269026      5  1.2649111  1.0916224 Treatment       WT
     
    # 去除一列;-1表示去除第一列
    > merge_data = merge_data[,-1]
    > merge_data
      Gene_a      Gene_b Gene_c     Gene_d     Gene_e     Group Genotype
    A      2  1.87789657      1 -1.2649111 -0.4125356   Control       WT
    B     10 -0.11717937      2 -0.6324555  0.1219293   Control       WT
    C     11  2.92953522      3  0.0000000 -0.4763589 Treatment   Mutant
    D     13  1.33836620      4  0.6324555 -0.9717165 Treatment   Mutant
    E      4 -0.03269026      5  1.2649111  1.0916224 Treatment       WT
     
    # 提取出所有的数值列。sapply()对列表或者向量使用函数
    > merge_data[sapply(merge_data, is.numeric)]
      Gene_a      Gene_b Gene_c     Gene_d     Gene_e
    A      2  1.87789657      1 -1.2649111 -0.4125356
    B     10 -0.11717937      2 -0.6324555  0.1219293
    C     11  2.92953522      3  0.0000000 -0.4763589
    D     13  1.33836620      4  0.6324555 -0.9717165
    E      4 -0.03269026      5  1.2649111  1.0916224
    
  • 相关阅读:
    pip 的简单安装与基本使用
    macOS(OS X)安装与配置 Homebrew
    命令行媒体处理工具 FFmpeg
    修正 FreeBSD 字体锯齿问题
    在 Linux 环境直接复移动硬盘上的 GRUB
    命令行视频下载工具 youget 和 youtubedl
    20不努力,30做助理(转载)
    树、森林和二叉树的转换
    百度地图和solr展示资源和附近等功能的实现 二
    加密Web项目中数据库配置文件中的密码
  • 原文地址:https://www.cnblogs.com/freescience/p/7441833.html
Copyright © 2020-2023  润新知