• 《利用python进行数据分析》学习之文件加载、写入


     

    csv文件读取

    In [1]:
    import pandas as pd
    import sys
    
    In [2]:
    %cat examples/ex2.csv
    
     
    1,2,3,4,hello
    5,6,7,8,world
    9,10,11,12,foo
    In [3]:
    #文件没有标签数据
    pd.read_csv('examples/ex2.csv',header=None)
    
    Out[3]:
     
     01234
    0 1 2 3 4 hello
    1 5 6 7 8 world
    2 9 10 11 12 foo
    In [4]:
    pd.read_csv('examples/ex2.csv',names=['a','b','c','d','message'])
    
    Out[4]:
     
     abcdmessage
    0 1 2 3 4 hello
    1 5 6 7 8 world
    2 9 10 11 12 foo
    In [5]:
    #将其中的一列设为索引列
    pd.read_csv('examples/ex2.csv',names=['a','b','c','d','message'],index_col='message')
    
    Out[5]:
     
     abcd
    message    
    hello 1 2 3 4
    world 5 6 7 8
    foo 9 10 11 12
    In [6]:
    list(open('examples/ex3.txt'))
    
    Out[6]:
    ['            A         B         C\n',
     'aaa -0.264438 -1.026059 -0.619500\n',
     'bbb  0.927272  0.302904 -0.032399\n',
     'ccc -0.264273 -0.386314 -0.217601\n',
     'ddd -0.871858 -0.348382  1.100491\n']
    In [7]:
    #多出的一列数据自动识别为索引;分隔符不固定时可使用正则表达式
    pd.read_csv('examples/ex3.txt',sep='\s+')
    
    Out[7]:
     
     ABC
    aaa -0.264438 -1.026059 -0.619500
    bbb 0.927272 0.302904 -0.032399
    ccc -0.264273 -0.386314 -0.217601
    ddd -0.871858 -0.348382 1.100491
     

    将读取数据中的特定值设为NaN

    In [8]:
    %cat examples/ex5.csv
    
     
    something,a,b,c,d,message
    one,1,2,3,4,NA
    two,5,6,,8,world
    three,9,10,11,12,foo
    In [9]:
    pd.read_csv('examples/ex5.csv',na_values={'something':['two'],'message':['NA','foo']})
    
    Out[9]:
     
     somethingabcdmessage
    0 one 1 2 3.0 4 NaN
    1 NaN 5 6 NaN 8 world
    2 three 9 10 11.0 12 NaN
    In [10]:
    #只读取一部分数据
    pd.read_csv('examples/ex6.csv',nrows=10)
    
    Out[10]:
     
     onetwothreefourkey
    0 0.467976 -0.038649 -0.295344 -1.824726 L
    1 -0.358893 1.404453 0.704965 -0.200638 B
    2 -0.501840 0.659254 -0.421691 -0.057688 G
    3 0.204886 1.074134 1.388361 -0.982404 R
    4 0.354628 -0.133116 0.283763 -0.837063 Q
    5 1.817480 0.742273 0.419395 -2.251035 Q
    6 -0.776764 0.935518 -0.332872 -1.875641 U
    7 -0.913135 1.530624 -0.572657 0.477252 K
    8 0.358480 -0.497572 -0.367016 0.507702 S
    9 -1.740877 -1.160417 -1.637830 2.172201 G
    In [11]:
    #将数据分块读取
    chunker = pd.read_csv('examples/ex6.csv',chunksize=1000)
    for piece in chunker:
        print(piece.iloc[0])
    
     
    one       0.467976
    two     -0.0386485
    three    -0.295344
    four      -1.82473
    key              L
    Name: 0, dtype: object
    one       0.467976
    two     -0.0386485
    three    -0.295344
    four      -1.82473
    key              T
    Name: 1000, dtype: object
    one       0.467976
    two     -0.0386485
    three    -0.295344
    four      -1.82473
    key              1
    Name: 2000, dtype: object
    one       0.467976
    two     -0.0386485
    three    -0.295344
    four      -1.82473
    key              H
    Name: 3000, dtype: object
    one       0.467976
    two     -0.0386485
    three    -0.295344
    four      -1.82473
    key              H
    Name: 4000, dtype: object
    one       0.467976
    two     -0.0386485
    three    -0.295344
    four      -1.82473
    key              1
    Name: 5000, dtype: object
    one       0.467976
    two     -0.0386485
    three    -0.295344
    four      -1.82473
    key              I
    Name: 6000, dtype: object
    one       0.467976
    two     -0.0386485
    three    -0.295344
    four      -1.82473
    key              1
    Name: 7000, dtype: object
    one       0.467976
    two     -0.0386485
    three    -0.295344
    four      -1.82473
    key              7
    Name: 8000, dtype: object
    one       0.467976
    two     -0.0386485
    three    -0.295344
    four      -1.82473
    key              B
    Name: 9000, dtype: object
    
     

    写入csv文件

    In [12]:
    data = pd.read_csv('examples/ex5.csv')
    data.to_csv(sys.stdout)
    
     
    ,something,a,b,c,d,message
    0,one,1,2,3.0,4,
    1,two,5,6,,8,world
    2,three,9,10,11.0,12,foo
    
    In [13]:
    data.to_csv(sys.stdout,sep='|')
    
     
    |something|a|b|c|d|message
    0|one|1|2|3.0|4|
    1|two|5|6||8|world
    2|three|9|10|11.0|12|foo
    
    In [14]:
    #对缺失值进行标识
    data.to_csv(sys.stdout,na_rep='NULL')
    
     
    ,something,a,b,c,d,message
    0,one,1,2,3.0,4,NULL
    1,two,5,6,NULL,8,world
    2,three,9,10,11.0,12,foo
    
    In [15]:
    data.to_csv(sys.stdout,index=False,header=False)
    
     
    one,1,2,3.0,4,
    two,5,6,,8,world
    three,9,10,11.0,12,foo
    
    In [16]:
    #只写入子集
    data.to_csv(sys.stdout,index=False,columns=['a','b','c'])
    
     
    a,b,c
    1,2,3.0
    5,6,
    9,10,11.0
    
     

    json文件

    In [17]:
    %cat examples/example.json
    
     
    [{"a": 1, "b": 2, "c": 3},
     {"a": 4, "b": 5, "c": 6},
     {"a": 7, "b": 8, "c": 9}]
    
    In [18]:
    data = pd.read_json('examples/example.json')
    data
    
    Out[18]:
     
     abc
    0 1 2 3
    1 4 5 6
    2 7 8 9
    In [19]:
    data.to_json(sys.stdout)
    
     
    {"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}
    In [20]:
    #按行写入
    data.to_json(sys.stdout,orient='records')
    
     
    [{"a":1,"b":2,"c":3},{"a":4,"b":5,"c":6},{"a":7,"b":8,"c":9}]
     

    HTML

    In [21]:
    #搜索并解析包含在table标签中的数据
    tables = pd.read_html('examples/fdic_failed_bank_list.html')
    #只有一张表格
    len(tables)
    
    Out[21]:
    1
    In [22]:
    data = tables[0]
    data.head()
    
    Out[22]:
     
     Bank NameCitySTCERTAcquiring InstitutionClosing DateUpdated Date
    0 Allied Bank Mulberry AR 91 Today's Bank September 23, 2016 November 17, 2016
    1 The Woodbury Banking Company Woodbury GA 11297 United Bank August 19, 2016 November 17, 2016
    2 First CornerStone Bank King of Prussia PA 35312 First-Citizens Bank & Trust Company May 6, 2016 September 6, 2016
    3 Trust Company Bank Memphis TN 9956 The Bank of Fayette County April 29, 2016 September 6, 2016
    4 North Milwaukee State Bank Milwaukee WI 20364 First-Citizens Bank & Trust Company March 11, 2016 June 16, 2016
     

    excel

    In [23]:
    data = pd.read_excel('examples/ex1.xlsx','Sheet1')
    data
    
    Out[23]:
     
     Unnamed: 0abcdmessage
    0 0 1 2 3 4 hello
    1 1 5 6 7 8 world
    2 2 9 10 11 12 foo
    In [24]:
    data.to_excel('examples/ex2.xlsx')
    
     

    Web API

    In [25]:
    import requests
    url = 'https://api.github.com/repos/pandas-dev/pandas/issues'
    resp = requests.get(url)
    resp
    
    Out[25]:
    <Response [200]>
    In [26]:
    data = resp.json()#data为字典数组
    issues = pd.DataFrame(data,columns=['title','url','state','labels'])#提取部分字段
    issues.head()
    
    Out[26]:
     
     titleurlstatelabels
    0 Resample on MultiIndex level is much longer th... https://api.github.com/repos/pandas-dev/pandas... open []
    1 BUG: value_counts can handle the case even wit... https://api.github.com/repos/pandas-dev/pandas... open []
    2 Build error on Arch linux https://api.github.com/repos/pandas-dev/pandas... open []
    3 BUG: DataFrame.to_html validates formatters ha... https://api.github.com/repos/pandas-dev/pandas... open []
    4 BUG DataFream.rindex方法参数的错误 https://api.github.com/repos/pandas-dev/pandas... open [{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...
  • 相关阅读:
    BZOJ 3236 AHOI 2013 作业 莫队算法
    使用再哈希算法查找元素
    冰雪奇缘--用爱酿就一部经典
    Scrapy系列教程(2)------Item(结构化数据存储结构)
    html学习笔记二
    知方可补不足~sqlserver中触发器的使用
    我心中的核心组件(可插拔的AOP)~调度组件quartz.net
    EF架构~为BulkInsert引入SET IDENTITY_INSERT ON功能
    知方可补不足~sqlserver中使用sp_who查看sql的进程
    知方可补不足~为千万级数据表加索引
  • 原文地址:https://www.cnblogs.com/FinnChan/p/11604881.html
Copyright © 2020-2023  润新知