import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;

/**
 * Splits a large text file into consecutive part files on line boundaries.
 *
 * <p>Part {@code i} is written to {@code <outputPrefix><i>.txt}. The chunk size is
 * {@code fileLines / (parts - 1)}; the first {@code parts - 1} files receive at most that
 * many lines each and the last file receives every line that is left.
 */
public class FileSplit {

    /** Input file used when no command-line argument is supplied. */
    private static final String DEFAULT_INPUT = "/media/Data/毕业设计/kdd cup/数据/userid_profile.txt";

    /** Output path prefix used when no command-line argument is supplied. */
    private static final String DEFAULT_OUTPUT_PREFIX = "/home/haoqiong/part";

    /** Default number of part files. */
    private static final int DEFAULT_PARTS = 112;

    /** Default number of lines expected in the input file. */
    private static final long DEFAULT_FILE_LINES = 23669283L;

    /** Upper bound for the reader buffer, in chars (20 * 1024 * 1024). */
    private static final int MAX_BUFFER_SIZE = 20 * 1024 * 1024;

    /** Lower bound for the reader buffer, in chars. */
    private static final int MIN_BUFFER_SIZE = 8192;

    /**
     * Runs the split and prints the elapsed time in milliseconds.
     *
     * @param args optional overrides, in order: input file, output path prefix,
     *             number of parts, number of lines in the input; any argument that is
     *             missing falls back to the built-in default
     * @throws IOException if the input cannot be read or a part file cannot be written
     * @throws NumberFormatException if the part count or line count is not a number
     * @throws IllegalArgumentException if the part count is below 1 or the line count is negative
     */
    public static void main(String[] args) throws IOException {
        long timer = System.currentTimeMillis();

        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String outputPrefix = args.length > 1 ? args[1] : DEFAULT_OUTPUT_PREFIX;
        int parts = args.length > 2 ? Integer.parseInt(args[2]) : DEFAULT_PARTS;
        long fileLines = args.length > 3 ? Long.parseLong(args[3]) : DEFAULT_FILE_LINES;

        split(inputPath, outputPrefix, parts, fileLines);

        timer = System.currentTimeMillis() - timer;
        System.out.println("处理时间:" + timer);
    }

    /** Streams the input once and distributes its lines over {@code parts} output files. */
    private static void split(String inputPath, String outputPrefix, int parts, long fileLines)
            throws IOException {
        if (parts < 1) {
            throw new IllegalArgumentException("parts must be >= 1: " + parts);
        }
        if (fileLines < 0) {
            throw new IllegalArgumentException("fileLines must be >= 0: " + fileLines);
        }

        int lastIndex = parts - 1;
        // With a single part there is nothing to divide; that part simply takes every line.
        long perSplitLines = lastIndex == 0 ? Long.MAX_VALUE : fileLines / lastIndex;

        File file = new File(inputPath);
        // A small input does not need the full 20M-char buffer.
        int bufferSize = (int) Math.max(MIN_BUFFER_SIZE, Math.min(MAX_BUFFER_SIZE, file.length()));

        // try-with-resources closes the reader and every writer even when an I/O error occurs.
        try (BufferedReader input = new BufferedReader(
                new InputStreamReader(new FileInputStream(file)), bufferSize)) {
            for (int i = 0; i <= lastIndex; ++i) {
                // The last part is not capped, so lines beyond the expected count are kept.
                long limit = (i == lastIndex) ? Long.MAX_VALUE : perSplitLines;
                try (FileWriter output = new FileWriter(outputPrefix + i + ".txt")) {
                    String line;
                    for (long lineCounter = 0;
                            lineCounter < limit && (line = input.readLine()) != null;
                            ++lineCounter) {
                        output.write(line);
                        // readLine() strips the terminator; put one back so lines stay separate.
                        output.write('\n');
                    }
                }
            }
        }
    }
}
// Method 2: copy a large text file line by line through a BufferedReader with a 10 MB buffer.
void largeFileIO(String inputFile, String outputFile) { try { BufferedInputStream bis = new BufferedInputStream(new FileInputStream(new File(inputFile))); BufferedReader in = new BufferedReader(new InputStreamReader(bis, "utf-8"), 10 * 1024 * 1024);//10M缓存 FileWriter fw = new FileWriter(outputFile); while (in.ready()) { String line = in.readLine(); fw.append(line + " "); } in.close(); fw.flush(); fw.close(); } catch (IOException ex) { ex.printStackTrace(); }
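// NIO read/write: read the input through a FileChannel in fixed-size chunks, split it into lines, and write each line to the output channel.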
package com.netty.demo1.vera.demo;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

/**
 * Receives one line of text at a time.
 *
 * <p>NOTE(review): {@link NIOReadline} below does not reference this interface.
 */
interface Callback {
    void action(String line);
}

/**
 * Copies a GBK-encoded text file to a UTF-8 file line by line using NIO file channels.
 *
 * <p>The input is read in fixed-size byte chunks. Because a chunk boundary can fall in the
 * middle of a line (or of a multi-byte character), the bytes of an unfinished line are kept
 * until its line feed arrives and only then decoded.
 */
public class NIOReadline {

    /** Line feed byte; marks the end of a line. */
    private static final byte LF = 10;

    /** Carriage return byte; dropped when it immediately precedes a line feed. */
    private static final byte CR = 13;

    /** Charset name used to decode the input bytes. */
    private static final String INPUT_ENCODING = "GBK";

    /** Terminator appended to every line that is written out. */
    private static final String LINE_TERMINATOR = "\n";

    /**
     * Copies {@code C:\Devs\abc.txt} to {@code C:\Devs\out.txt} and prints the start and
     * end timestamps separated by {@code |}.
     *
     * @param args ignored
     * @throws IOException if either file cannot be opened or closed
     */
    public static void main(String[] args) throws IOException {
        int bufSize = 1000000; // number of bytes requested per channel read
        File fin = new File("C:\\Devs\\abc.txt");  // file to read
        File fout = new File("C:\\Devs\\out.txt"); // file to write

        Date startDate = new Date();
        // try-with-resources closes both channels (and their files) even if the copy fails.
        // NOTE(review): mode "rws" asks for synchronous writes to storage on every update,
        // which is slow for per-line writes; "rw" may be what was wanted — confirm.
        try (FileChannel fcin = new RandomAccessFile(fin, "r").getChannel();
             FileChannel fcout = new RandomAccessFile(fout, "rws").getChannel()) {
            ByteBuffer rBuffer = ByteBuffer.allocate(bufSize);
            ByteBuffer wBuffer = ByteBuffer.allocateDirect(bufSize);
            readFileByLine(bufSize, fcin, rBuffer, fcout, wBuffer);
        }
        Date endDate = new Date();
        System.out.print(startDate+"|"+endDate); // elapsed-time check
    }

    /**
     * Reads {@code fcin} to its end, splits the bytes into lines at every line feed, and
     * writes each line (decoded as GBK, followed by {@code "\n"}) through
     * {@link #writeFileByLine}. A carriage return directly before the line feed is removed.
     * A final line without a line feed is written as well.
     *
     * <p>An {@link IOException} is not propagated: its stack trace is printed and the
     * method returns.
     *
     * @param bufSize unused; kept so existing callers keep compiling
     * @param fcin    channel to read from
     * @param rBuffer scratch buffer for reads; it is cleared before use and must have a
     *                non-zero capacity
     * @param fcout   channel to write to
     * @param wBuffer passed through to {@link #writeFileByLine}
     * @throws IllegalArgumentException if {@code rBuffer} has no capacity
     */
    public static void readFileByLine(int bufSize, FileChannel fcin,
            ByteBuffer rBuffer, FileChannel fcout, ByteBuffer wBuffer) {
        rBuffer.clear();
        if (!rBuffer.hasRemaining()) {
            // read() would return 0 forever and the loop below would never end.
            throw new IllegalArgumentException("rBuffer must have a non-zero capacity");
        }

        // Bytes of the line being assembled; survives across chunk boundaries so a
        // multi-byte character split between two reads is decoded in one piece.
        ByteArrayOutputStream pending = new ByteArrayOutputStream();
        try {
            while (fcin.read(rBuffer) != -1) {
                rBuffer.flip();
                byte[] chunk = new byte[rBuffer.remaining()];
                rBuffer.get(chunk);
                rBuffer.clear();

                int lineStart = 0;
                for (int i = 0; i < chunk.length; i++) {
                    if (chunk[i] == LF) {
                        pending.write(chunk, lineStart, i - lineStart);
                        writePendingLine(pending, fcout, wBuffer);
                        lineStart = i + 1;
                    }
                }
                // Whatever follows the last line feed belongs to a line that is not finished yet.
                pending.write(chunk, lineStart, chunk.length - lineStart);
            }
            if (pending.size() > 0) { // last line of the file had no line feed
                writePendingLine(pending, fcout, wBuffer);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Decodes the collected bytes as one line, writes it with a terminator, and empties the collector. */
    private static void writePendingLine(ByteArrayOutputStream pending, FileChannel fcout,
            ByteBuffer wBuffer) throws UnsupportedEncodingException {
        byte[] raw = pending.toByteArray();
        pending.reset();
        int length = raw.length;
        if (length > 0 && raw[length - 1] == CR) {
            length--; // drop the CR of a CRLF pair, even when the pair straddled two chunks
        }
        String line = new String(raw, 0, length, INPUT_ENCODING);
        writeFileByLine(fcout, wBuffer, line + LINE_TERMINATOR);
    }

    /**
     * Appends {@code line}, encoded as UTF-8, at the current end of {@code fcout}.
     *
     * <p>An {@link IOException} is not propagated: its stack trace is printed and the
     * method returns.
     *
     * @param fcout   channel to write to
     * @param wBuffer unused; kept so existing callers keep compiling
     * @param line    text to write, including any terminator the caller wants
     */
    public static void writeFileByLine(FileChannel fcout, ByteBuffer wBuffer, String line) {
        try {
            ByteBuffer encoded = ByteBuffer.wrap(line.getBytes(StandardCharsets.UTF_8));
            long position = fcout.size();
            // A single write may transfer fewer bytes than requested, so loop until drained.
            while (encoded.hasRemaining()) {
                position += fcout.write(encoded, position);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}