从hdfs里获取希望的数据:
"""Export filtered Flume metadata records from HDFS into local text files.

For each (day, hour) in the configured window, the script runs
``hdfs dfs -text`` on that hour's ``*.snappy`` files, keeps the lines that
pass the filter in :func:`format_record`, and writes the selected fields to
``metadata-2018-10-DD-HH.txt`` in the current directory.

All line handling is done in ``bytes``: the subprocess pipe yields bytes, and
staying in bytes avoids guessing the data's encoding.
"""
import subprocess

# Window of October 2018 to export: days [22, 23) and hours [17, 24).
DAYS = range(22, 23)
HOURS = range(17, 24)

FIELD_SEPARATOR = b"^"
MIN_LINE_LENGTH = 100      # lines of this length or shorter are skipped
FILTER_FIELD_INDEX = 6     # field that must equal FILTER_FIELD_VALUE
FILTER_FIELD_VALUE = b"6"
LEADING_FIELD_COUNT = 32   # the first N fields are copied to the output
EXTRA_FIELD_INDEX = 95     # one additional field appended after them


def format_record(line):
    """Return the output bytes for one raw input line, or None to skip it.

    A line is kept when it is longer than MIN_LINE_LENGTH bytes and its field
    number FILTER_FIELD_INDEX (splitting on ``^``) equals FILTER_FIELD_VALUE.
    The result is the first LEADING_FIELD_COUNT fields plus field
    EXTRA_FIELD_INDEX, space-separated, followed by a single space.

    Args:
        line: one raw line as bytes, as read from the subprocess pipe.

    Returns:
        The bytes to write, or None when the line does not pass the filter.

    Raises:
        IndexError: the line is long enough to be considered but does not
            have enough ``^``-separated fields.
    """
    if len(line) <= MIN_LINE_LENGTH:
        return None
    fields = line.split(FIELD_SEPARATOR)
    if fields[FILTER_FIELD_INDEX] != FILTER_FIELD_VALUE:
        return None
    # NOTE(review): records are terminated by a space, not a newline, so the
    # output file is one long line. Kept as in the original; confirm intent.
    return (
        b" ".join(fields[:LEADING_FIELD_COUNT])
        + b" "
        + fields[EXTRA_FIELD_INDEX]
        + b" "
    )


def dump_hour(day, hour):
    """Export one hour of metadata from HDFS into its local text file.

    Args:
        day: day of the month (October 2018).
        hour: hour of the day, 0-23.
    """
    filename = "metadata-2018-10-%02d-%02d.txt" % (day, hour)
    command = "hdfs dfs -text /flume/metadata/2018/10/%02d/%02d/*.snappy" % (day, hour)
    print(command)

    # shell=True is kept from the original; the command is built only from
    # integers, so no untrusted text reaches the shell.
    proc = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
    try:
        # Binary mode: records are written exactly as the bytes read.
        with open(filename, "wb") as out:
            for line in proc.stdout:
                try:
                    record = format_record(line)
                except IndexError as e:
                    # Malformed line: report it and carry on with the rest.
                    print(e, "fuck error", line)
                    continue
                if record is not None:
                    out.write(record)
    finally:
        # Always release the pipe and reap the child, even on failure.
        proc.stdout.close()
        returncode = proc.wait()

    if returncode != 0:
        print("command exited with status %d: %s" % (returncode, command))


def main():
    """Export every (day, hour) in the configured window."""
    for day in DAYS:
        for hour in HOURS:
            dump_hour(day, hour)


if __name__ == "__main__":
    main()