• R语言学习笔记之八


    摘要: 仅用于记录R语言学习过程:

    内容提要:

    字符串的处理、正则表达式、stringi包和stringr包

    正文:

      字符串的处理

    n  导读:

    nchar(x)函数:返回每个字符串中字符的个数:

    > x <- c('fudan','jiaoda')

    > nchar(x)

    [1] 5 6   #返回每个字符串的字符个数

    length()函数:返回元素的个数

    > length(x)

    [1] 2

    u  toupper()函数:小写转大写

    > toupper('abc')

    [1] "ABC"

    u  tolower()函数:大写转小写

    > tolower('ABKC')

    [1] "abkc"

    u  paste()函数:(sep参数和collapse参数)粘贴功能

    > stringa <- LETTERS[1:5]

    > STRINGB <- 1:5

    > paste(stringa,STRINGB)

    [1] "A 1" "B 2" "C 3" "D 4" "E 5"

    > paste(stringa,STRINGB,sep = '-')  #sep分隔符

    [1] "A-1" "B-2" "C-3" "D-4" "E-5"

    > paste(stringa,STRINGB,collapse = '-')   # collapse分隔符

    [1] "A 1-B 2-C 3-D 4-E 5"

    u  paste0()函数:相当于sep = ''的paste(),去掉了A和1之间的空格;paste0()没有sep参数,多传入的参数(如下面的seq = '-')会被当作待拼接的向量,collapse的用法与paste()相同

    > paste0(stringa,STRINGB)

    [1] "A1" "B2" "C3" "D4" "E5"

    > paste0(stringa,STRINGB,seq = '-')

    [1] "A1-" "B2-" "C3-" "D4-" "E5-"

    > paste0(stringa,STRINGB,collapse = '-')

    [1] "A1-B2-C3-D4-E5"

    u  strsplit()函数:字符串拆分功能

    > stringC <- paste(stringa, STRINGB, sep = '/')

    > strsplit(stringC,split = '/')   #根据/ 进行拆分

    [[1]]

    [1] "A" "1"

    [[2]]

    [1] "B" "2"

    [[3]]

    [1] "C" "3"

    [[4]]

    [1] "D" "4"

    [[5]]

    [1] "E" "5"

    u  substr()函数:字符串截取函数;同时具有赋值功能

    > stringd <- c('python','java','ruby','php','linux')

    > sub_str <- substr(stringd,start = 2,stop = 4) #截取2-4位的字符,如果不够,就有几个返回几个

    > sub_str

    [1] "yth" "ava" "uby" "hp"  "inu"

    #实现赋值的功能

    > substr(stringd,start = 2,stop = 4) <- 'aaa'

    > stringd

    [1] "paaaon" "jaaa"   "raaa"   "paa"    "laaax"

    grep()函数:用于提取字符串中指定的字符,可返回位置,也可返回具体的值。

    > seq_names <- c('EU_FRA02_C1_S2008','AF_COM12_80_20014','AF_COM17_F0_S2008',

    +                'AS_CHN11_C3_2004','EU-FRA-C3-S2007','NAUSA02E02005','AS_CHN12_N0_05',

    +                'NA_USA03_C2_S2007','NA USA04 A3 2004',

    +                'EU_UK01_A0_2009','eu_fra_a2_s98','SA/BRA08/00/1996')

    > fra_seq <- grep(pattern = 'FRA|fra',x =seq_names)

    > fra_seq

    [1]  1  5 11

    > seq_names[fra_seq]

    [1] "EU_FRA02_C1_S2008" "EU-FRA-C3-S2007" 

    [3] "eu_fra_a2_s98"   

    > fra_seq <- grep(pattern = 'FRA|fra',x =seq_names,value = TRUE)

    > fra_seq

    [1] "EU_FRA02_C1_S2008" "EU-FRA-C3-S2007" 

    [3] "eu_fra_a2_s98"

    u  grepl()函数:返回的是逻辑值。没有value参数。ignore.case参数表示是否忽略大小写,TRUE为忽略。

    > grepl(pattern = 'FRA|fra',x =seq_names)

     [1]  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE

    [10] FALSE  TRUE FALSE

    > fra_seq <- grepl(pattern = 'fra',x =seq_names,ignore.case = TRUE)  #或:用ignore.case忽略大小写

    u  正则表达式:提取元素

    > spe_seq <- seq_names[!grepl(pattern = '[sS][0-9]{2,4}\\b',seq_names)]  #匹配右边界

    > spe_seq

    [1] "AF_COM12_80_20014" "AS_CHN11_C3_2004"

    [3] "NAUSA02E02005"     "AS_CHN12_N0_05"  

    [5] "NA USA04 A3 2004"  "EU_UK01_A0_2009" 

    [7] "SA/BRA08/00/1996"

    找到以ab开头的

    my_string <- c('above','about','abrotion','cab')

    grep(pattern = '\\bab',x = my_string,value = T) #匹配左边界

    u  gsub()函数:会把找到的所有匹配都替换掉;下例先去掉$,再把字符串变成数值

    money <- c('$1888','$2888','$3888')

    money <- gsub('\\$',replacement = '',money)

    as.numeric(money)

    u  sub()函数:只会替换掉找到的第一个字符

    > money <- c('$1888 $2888 $3888')

    > sub('\\$',replacement = '',money)

    [1] "1888 $2888 $3888"

    > gsub('\\$',replacement = '',money)

    [1] "1888 2888 3888"

    regexpr()函数

    > test_string <- c('happy','apple','application','apolitic')

    > regexpr('pp',test_string)

    [1]  3  2  2 -1   #返回pp出现的位置,-1表示没有

    attr(,"match.length")

    [1]  2  2  2 -1

    attr(,"useBytes")

    [1] TRUE

    > test_string[regexpr('pp',test_string)>0]  #提取含pp的字符串

    [1] "happy"       "apple"       "application"

    gregexpr()函数:用法同regexpr()函数,但返回每个字符串中所有匹配的位置(结果为列表)

    regexec()函数:用法同regexpr()函数,还会返回括号内子表达式的匹配位置(结果为列表)

    u  agrep()函数:可以匹配英美单词不同写法

    > string1 <- c('I need a favour','my favorite sport','you made an error')

    > agrep('favor',string1)

    [1] 1 2

      正则表达式

    n  原义表达式:只代表自己

    > mystring1 <- c('apple','orange')

    > grep('p',mystring1)

    [1] 1

    n  转义表达式:代表其他含义

    > # .所有字符

    > mystring2 <- c('shudo','.dfs','-dsfd')

    > grep('.',mystring2)

    [1] 1 2 3

    >

    > mystring3 <- c('9anv','fss7','1000','ss7')

    > grep('[7-9]',mystring3)

    [1] 1 2 4

    >

    > # ^a,匹配a开头的

    > mystring4 <- c('apple','application','abb')

    > grep('^ap',mystring4)

    [1] 1 2

    > # [^0-1]表示匹配0和1以外的字符

    > mystring5 <- c('9anv','fss7','1000','ss7')

    > grep('[^0-1]',mystring5)

    [1] 1 2 4

    > #{}代表重复的次数,{1,}表示至少重复1次,{2,3}表示重复2到3次

    > mystring6 <- c('1220','2289','2228','10002')

    > grep('2{2,3}',mystring6)

    [1] 1 2 3

    > # + 表示其前面的字符重复1次或多次,()表示把括号内的内容看成一个整体

    > mystring7 <- c('food','foot','foul','fans')

    > grep ('fo+',mystring7)

    [1] 1 2 3

    > grep('fo{1,}',mystring7)

    [1] 1 2 3

    > grep('(fo){1,}',mystring7)

    [1] 1 2 3

    >

    > #* 匹配0次或以上

    > #| 管道符  或,满足其中之一就可被返回

    >

    > mystring8 <- c('kobe','messi','neymar')

    > grep('^k|^m',mystring8)

    [1] 1 2

    > # $表示匹配字符串末尾

    > mystring9 <- c('active','positive','negative','iention')

    > grep('ive$',mystring9)  #匹配字符串末尾

    [1] 1 2 3

    > grep('ive\\b',mystring9)

    [1] 1 2 3

    n  保义字符:

    #

    mystring10 <- c('ac^bb','^df')

    grep('\\^',mystring10)

    [1] 1 2

    \d = [0-9]  匹配数字0-9

    \D = [^0-9] 匹配非数字

    \s   匹配空白字符,空格,制表符,换行符

    \S  匹配非空白字符

    \w  匹配字母、数字和下划线   =[a-zA-Z0-9_]

    \W  匹配非字母、数字和下划线  =[^a-zA-Z0-9_]

    \b   匹配字符的边界

    \B   匹配字符的非边界

    \<   匹配单词的开头(词首边界)  如‘the republic’中the的开头

    \>   匹配单词的结尾(词尾边界)  如‘the republic’中the的结尾

    示例:

    > mystring11 <- c('2013','abcd','13sg')

    > grep('\\d',mystring11)

    [1] 1 3

    > grep('\\D',mystring11)

    [1] 2 3

    > mystring12 <- c('foo t','    able','   moth  er','happy')

    > grep('\\s',mystring12)

    [1] 1 2 3

    > grep('\\S',mystring12)

    [1] 1 2 3 4

    > mystring13 <- c('theory','the republic','they')

    > grep('\\<the\\>',mystring13)   #以the作为边界的字符串,the为一个单独的单词

    [1] 2

      stringr与stringi包

    n  stringi包更加依赖正则表达式

    stringr中的常用函数

    str_c()函数:类似paste()函数

    > str_c('a','b')

    [1] "ab"

    > str_c('a','b',sep = '-')

    [1] "a-b"

    str_length()函数:用于字符串计数

    > str_length('abdc')

    [1] 4

    str_sub()函数:用于字符串提取,类似substr()函数,有三个参数:数据名,开始位置,结束位置(可以接受向量),可以接受赋值

    > yxf <- 'yi xue fang'

    > str_sub(yxf,c(1,4,8),c(2,6,11))

    [1] "yi"   "xue"  "fang"

    >

    > str_sub(yxf,1,1) <- 'Y'     #可以接受赋值

    > yxf

    [1] "Yi xue fang"

    str_dup()函数:用于复制

    > fruit <- c('apple','pear','banana')

    > str_dup(fruit,2)

    [1] "appleapple"   "pearpear"     "bananabanana"

    > fruit <- c('apple','pear','banana')

    > str_dup(fruit,2:4)

    [1] "appleapple"               "pearpearpear"           

    [3] "bananabananabananabanana"

    str_trim()函数:去掉字符串首尾的空格,也可以设置成right和left,分别去掉右边和左边的空格

    > string <- ' Eternal love for YanQ '

    > str_trim(string,side = 'both')

    [1] "Eternal love for YanQ"

    str_extract()函数:用于提取

    phones <- c('219 733 8965','329-293-8753','banana','595 794 7569',

                '387 287 6718','apple','233.398.9187','482 952 3315',

                '239 923 8115 and 842 566 4692','Work: 579-499-7527','$1000',

                'Home:543.355.3679')

    str_extract(phones,'([0-9]{3})[- .]([0-9]{3})[- .]([0-9]{4})\\b')

    [1] "219 733 8965" "329-293-8753" NA             "595 794 7569" "387 287 6718"

     [6] NA             "233.398.9187" "482 952 3315" "239 923 8115" "579-499-7527"

    [11] NA             "543.355.3679"

    或写成:str_extract(phones,'([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})')

    str_replace()函数:用于字符串替换,只替换找到的第一个

    > fruits <- c('one apple','two pears','three bananas')

    > str_replace(fruits,'[aeiou]','-')  #[被替换的对象] ,‘拟替换成的对象’

    [1] "-ne apple"     "tw- pears"     "thr-e bananas"

    str_replace_all()函数:替换所有

    > fruits <- c('one apple','two pears','three bananas')

    > str_replace_all(fruits,'[aeiou]','-')

    [1] "-n- -ppl-"     "tw- p--rs"     "thr-- b-n-n-s"

    n  stringi中的常用函数

    u  stri_join()函数:

    > stri_join(1:7,letters[1:7],sep = '-')

    [1] "1-a" "2-b" "3-c" "4-d" "5-e" "6-f" "7-g"

    > stri_join(1:7,letters[1:7],collapse = '-')

    [1] "1a-2b-3c-4d-5e-6f-7g"

    u  stri_cmp_eq() & stri_cmp_neq()函数:

    > stri_cmp_eq('ab','ab')

    [1] TRUE

    > stri_cmp_neq('ab','ab')

    [1] FALSE

    u  stri_cmp_lt() & stri_cmp_gt()函数:用于字符串比大小,lt 前者小于后者,gt前者大于后者

    > stri_cmp_lt('121','221')

    [1] TRUE

    > stri_cmp_lt('a121','b221')

    [1] TRUE

    > stri_cmp_gt('121','221')

    [1] FALSE

    u  stri_count()函数:用于计数

    > language <- c('python','R','PHP','Ruby','Java',

    +               'JavaScript','C','Oracle','C++','C#','Spark',

    +               'Go','Room','Good','Pathon','ScriptJava','R2R','C+','C*')

    > stri_count(language,fixed = 'R')

     [1] 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 2 0 0

    > stri_count(language,regex = '^J')

          [1] 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0

    u  stri_count_boundaries()函数:字符串元素个数的计数

    > test <- 'The\u00a0above-mentioned     features are very useful.

    + Warm thanks to their developers. Tomorrow is a ,new$% day###'

    > stri_count_boundaries(test,type = 'word')

    [1] 45

    > stri_count_boundaries(test,type = 'sentence')

    [1] 3

    > stri_count_boundaries(test,type = 'character')

    [1] 110

    u  stri_duplicated()函数:识别重复的字符串

    > stri_duplicated(c('a','b','a',NA,'a',NA))

    [1] FALSE FALSE  TRUE FALSE  TRUE  TRUE

    > stri_duplicated(c('a','b','a',NA,'a',NA),fromLast = T)  #从最后开始看

    [1]  TRUE FALSE  TRUE  TRUE FALSE FALSE

    > stri_duplicated_any(c('a','b','a',NA,'a',NA))

    [1] 3

    u  stri_dup()函数:重复

    > stri_dup(c('abc','parst'),c(4,2))

    [1] "abcabcabcabc" "parstparst" 

    u  stri_detect_fixed()函数:发现匹配函数

    > stri_detect_fixed(c('stringi R','REXAMINE','123'),c('i','R','0'))

    [1]  TRUE  TRUE FALSE

    u  stri_detect_regex()函数:

    > stri_detect_regex(c('above','abort','about','abnormal','abandon'),'^ab')

    [1] TRUE TRUE TRUE TRUE TRUE

    > stri_detect_regex(c('above','abort','about','abnormal','abandon'),'t\\b')

    [1] FALSE  TRUE  TRUE FALSE FALSE

    > stri_detect_regex(c('ABOUT','abort','AboVE'),'^ab',case_insensitive = TRUE)  #忽略大小写

    [1] TRUE TRUE TRUE

    u  stri_startswith_fixed()函数:

    > stri_startswith_fixed(c('a1','a2','b3','a4','c5'),'a')

    [1]  TRUE  TRUE FALSE  TRUE FALSE

    >

    > stri_startswith_fixed(c('a1','a2','b3','a4','c5'),'a1')

    [1]  TRUE FALSE FALSE FALSE FALSE

    >

    > stri_startswith_fixed(c('abaDc','aabadc','ababa'),'ba',from = 2)  #从哪个字符开始匹配,从第二个字符开始匹配

    [1]  TRUE FALSE  TRUE

    u  stri_endswith_fixed()函数:

    > stri_endswith_fixed(c('abaDc','aabadc','ababa'),'ba')

    [1] FALSE FALSE  TRUE

    > stri_endswith_fixed(c('abaDc','aabadc','ababa'),'ba', to = 3)  #匹配到第几位,匹配到第三位

    [1]  TRUE FALSE  TRUE

    u  stri_extract_all()函数:提取

    > tEmp_text <- c('EU_FRA02_C1_S2008','AF_COM12_80_20014','AF_COM17_F0_S2008',

    +                'AS_CHN11_C3_2004','EU-FRA-C3-S2007','NAUSA02E02005','AS_CHN12_N0_05',

    +                'NA_USA03_C2_S2007','NA USA04 A3 2004',

    +                'EU_UK01_A0_2009','eu_fra_a2_s98','SA/BRA08/00/1996')

    >

    > # Generate a character vector composed of several sequence names.

    >

    > stri_extract_all(tEmp_text,regex = '[0-9]{2,4}\\b')

    [[1]]

    [1] "2008"

    [[2]]

    [1] "0014"

    [[3]]

    [1] "2008"

    [[4]]

    [1] "2004"

    [[5]]

    [1] "2007"

    [[6]]

    [1] "2005"

    [[7]]

    [1] "05"

    [[8]]

    [1] "2007"

    [[9]]

    [1] "04"   "2004"

    [[10]]

    [1] "2009"

    [[11]]

    [1] "98"

    [[12]]

    [1] "08"   "00"   "1996"

    u  stri_extract_all_fixed()函数:

    > stri_extract_all_fixed('abaBAba','Aba',case_insensitive = T, overlap =T)

    [[1]]   #可交叉

    [1] "aba" "aBA" "Aba"

    u  stri_extract_all_boundaries()函数:按文本边界切分字符串并提取各个片段

    > stri_extract_all_boundaries('stringi: THE string processing package 123.48...')

    [[1]]

    [1] "stringi: "   "THE "        "string "     "processing " "package "  

    [6] "123.48..."   #但是带出来单词后面的空格

    u  stri_extract_all_words()函数:提取字符串中的单词,不含空格和标点

    > stri_extract_all_words('stringi: THE string processing package 123.48...')

    [[1]]

    [1] "stringi"    "THE"        "string"     "processing" "package"    "123.48"

    u  stri_isempty()函数:字符串内是否为空

    > stri_isempty(c(',','','abc','123','\u0105\u0104',' '))

    [1] FALSE  TRUE FALSE FALSE FALSE FALSE

    u  stri_locate_all()函数:定位函数

    > stri_locate_all('I want to learn R to promote my statistical skills',fixed = 'to')

    [[1]]

         start end

    [1,]     8   9

    [2,]    19  20  #返回的是位置,起始和结束,可用于提取

  • 相关阅读:
    OCP-1Z0-053-V12.02-515题
    OCP-1Z0-053-V12.02-605题
    OCP-1Z0-053-V12.02-648题
    OCP-1Z0-053-V12.02-669题
    OCP-1Z0-053-V12.02-83题
    OCP-1Z0-053-V12.02-215题
    OCP-1Z0-053-V12.02-514题
    OCP-1Z0-053-V12.02-666题
    OCP-1Z0-053-V12.02-602题
    Oracle DB执行闪回数据库
  • 原文地址:https://www.cnblogs.com/ppjs/p/9439159.html
Copyright © 2020-2023  润新知