• 基于 Java 使用 Flink 批处理读取 CSV 文件,分别用 Table API(SQL)和 DataSet 的 join 方法两种方式实现多表联合的数据处理,再将结果写入 CSV 文件


    Maven依赖

    源头

    <!-- Original dependency set: Lombok plus the split Flink 1.8.0 Table modules. -->
    <dependencies>
        <!-- Lombok: provides the @Data annotation used by the row classes. -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.8</version>
        </dependency>

        <!-- Flink Table planner (Scala 2.11 build). -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner_2.11</artifactId>
            <version>1.8.0</version>
        </dependency>

        <!-- Flink Table API bridge for Java (Scala 2.11 build). -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-java-bridge_2.11</artifactId>
            <version>1.8.0</version>
        </dependency>

        <!-- Flink streaming Scala module (Scala 2.11 build). -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_2.11</artifactId>
            <version>1.8.0</version>
        </dependency>

        <!-- Flink Table common module. -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-common</artifactId>
            <version>1.8.0</version>
        </dependency>
    </dependencies>
    

    改版

        <!--
            Revised dependency set: the single flink-table module instead of the
            split Table modules. All Flink artifacts are pinned to the same
            release (1.7.2); the original mixed flink-table 1.7.2 with
            flink-streaming-scala 1.8.0, and mixing Flink module versions on one
            classpath risks binary incompatibilities at runtime.
            NOTE(review): confirm the job still runs on 1.7.2 before adopting.
        -->
        <dependencies>
            <!-- Lombok: provides the @Data annotation used by the row classes. -->
            <dependency>
                <groupId>org.projectlombok</groupId>
                <artifactId>lombok</artifactId>
                <version>1.18.8</version>
            </dependency>

            <!-- Flink Table API & SQL (Scala 2.11 build). -->
            <dependency>
                <groupId>org.apache.flink</groupId>
                <artifactId>flink-table_2.11</artifactId>
                <version>1.7.2</version>
            </dependency>

            <!-- Flink streaming Scala module, same release as flink-table. -->
            <dependency>
                <groupId>org.apache.flink</groupId>
                <artifactId>flink-streaming-scala_2.11</artifactId>
                <version>1.7.2</version>
            </dependency>
        </dependencies>
    

    SQL语句

    -- Row count of each source table. The comment line under each query lists
    -- that table's columns; trailing numbers appear to be the observed results.
    SELECT COUNT(*)
    FROM T13_REF_AIRPORT_SAT; --11008
    --HUB_ID IATA_CD NAME_CN NAME_EN
    SELECT COUNT(*)
    FROM T13_REF_AIRPORT_CITY_LINK; --9676
    --*******LINK_ID AIRPORT_HUB_ID CITY_HUB_ID
    SELECT COUNT(*)
    FROM T13_REF_CITY_SAT; --9624
    --HUB_ID CITY_CD NAME_CN NAME_EN
    SELECT COUNT(*)
    FROM T13_REF_CITY_COUNTRY_LINK; --9062
    --*******LINK_ID COUNTRY_HUB_ID CITY_HUB_ID
    SELECT COUNT(*)
    FROM T13_REF_COUNTRY_SAT; --356
    --HUB_ID  COUNTRY_CD NAME_CN NAME_EN

    -- Full join chain: airport -> airport/city link -> city -> city/country link -> country.
    SELECT *
    FROM T13_REF_AIRPORT_SAT AP
    INNER JOIN T13_REF_AIRPORT_CITY_LINK ACL
        ON AP.HUB_ID = ACL.AIRPORT_HUB_ID
    INNER JOIN T13_REF_CITY_SAT CI
        ON ACL.CITY_HUB_ID = CI.HUB_ID
    INNER JOIN T13_REF_CITY_COUNTRY_LINK CCL
        ON CI.HUB_ID = CCL.CITY_HUB_ID
    INNER JOIN T13_REF_COUNTRY_SAT CO
        ON CCL.COUNTRY_HUB_ID = CO.HUB_ID;

    -- Number of rows produced by the join chain.
    SELECT COUNT(*)
    FROM T13_REF_AIRPORT_SAT AP
    INNER JOIN T13_REF_AIRPORT_CITY_LINK ACL
        ON AP.HUB_ID = ACL.AIRPORT_HUB_ID
    INNER JOIN T13_REF_CITY_SAT CI
        ON ACL.CITY_HUB_ID = CI.HUB_ID
    INNER JOIN T13_REF_CITY_COUNTRY_LINK CCL
        ON CI.HUB_ID = CCL.CITY_HUB_ID
    INNER JOIN T13_REF_COUNTRY_SAT CO
        ON CCL.COUNTRY_HUB_ID = CO.HUB_ID; --16759

    -- Airport count per country (grouped by the country's Chinese name).
    SELECT
        CO.NAME_CN AS COUNTRY_CN_NAME,
        COUNT(AP.HUB_ID) AS COUNT_AIRPORT
    FROM T13_REF_AIRPORT_SAT AP
    INNER JOIN T13_REF_AIRPORT_CITY_LINK ACL
        ON AP.HUB_ID = ACL.AIRPORT_HUB_ID
    INNER JOIN T13_REF_CITY_SAT CI
        ON ACL.CITY_HUB_ID = CI.HUB_ID
    INNER JOIN T13_REF_CITY_COUNTRY_LINK CCL
        ON CI.HUB_ID = CCL.CITY_HUB_ID
    INNER JOIN T13_REF_COUNTRY_SAT CO
        ON CCL.COUNTRY_HUB_ID = CO.HUB_ID
    GROUP BY CO.NAME_CN
    ORDER BY COUNT_AIRPORT DESC; --254

    -- Airport count per (country, city) combination.
    SELECT
        CO.COUNTRY_CD,
        CO.NAME_CN AS COUNTRY_NAME_CN,
        CO.NAME_EN AS COUNTRY_NAME_EN,
        CI.CITY_CD,
        CI.NAME_CN AS CITY_CN_NAME,
        CI.NAME_EN AS CITY_EN_NAME,
        COUNT(AP.HUB_ID) AS COUNT_AIRPORT
    FROM T13_REF_AIRPORT_SAT AP
    INNER JOIN T13_REF_AIRPORT_CITY_LINK ACL
        ON AP.HUB_ID = ACL.AIRPORT_HUB_ID
    INNER JOIN T13_REF_CITY_SAT CI
        ON ACL.CITY_HUB_ID = CI.HUB_ID
    INNER JOIN T13_REF_CITY_COUNTRY_LINK CCL
        ON CI.HUB_ID = CCL.CITY_HUB_ID
    INNER JOIN T13_REF_COUNTRY_SAT CO
        ON CCL.COUNTRY_HUB_ID = CO.HUB_ID
    GROUP BY
        CO.COUNTRY_CD,
        CO.NAME_CN,
        CO.NAME_EN,
        CI.CITY_CD,
        CI.NAME_CN,
        CI.NAME_EN
    ORDER BY COUNT_AIRPORT DESC; --13030

    -- Same per-(country, city) aggregation, restricted to cities whose English name is NULL.
    SELECT
        CO.COUNTRY_CD,
        CO.NAME_CN AS COUNTRY_NAME_CN,
        CO.NAME_EN AS COUNTRY_NAME_EN,
        CI.CITY_CD,
        CI.NAME_CN AS CITY_CN_NAME,
        CI.NAME_EN AS CITY_EN_NAME,
        COUNT(AP.HUB_ID) AS COUNT_AIRPORT
    FROM T13_REF_AIRPORT_SAT AP
    INNER JOIN T13_REF_AIRPORT_CITY_LINK ACL
        ON AP.HUB_ID = ACL.AIRPORT_HUB_ID
    INNER JOIN T13_REF_CITY_SAT CI
        ON ACL.CITY_HUB_ID = CI.HUB_ID
    INNER JOIN T13_REF_CITY_COUNTRY_LINK CCL
        ON CI.HUB_ID = CCL.CITY_HUB_ID
    INNER JOIN T13_REF_COUNTRY_SAT CO
        ON CCL.COUNTRY_HUB_ID = CO.HUB_ID
    WHERE CI.NAME_EN IS NULL
    GROUP BY
        CO.COUNTRY_CD,
        CO.NAME_CN,
        CO.NAME_EN,
        CI.CITY_CD,
        CI.NAME_CN,
        CI.NAME_EN
    ORDER BY COUNT_AIRPORT DESC;

    --COUNTRY_NAME_EN=NULL 19
    --CITY_CN_NAME=NULL 1
    --CITY_EN_NAME=NULL 1501
    

    Airport_Sat

    import lombok.Data;
    
    /**
     * Row type for records read from T13_REF_AIRPORT_SAT.csv.
     * Only the hub id column is mapped; Lombok's {@code @Data} supplies the
     * accessors (e.g. {@code getHub_id()}).
     */
    @Data
    public class AirportSat {
        /** Airport hub id; joined against {@code AirportCityLink.airport_hub_id}. */
        private String hub_id;
    }
    

    Airport_City_Link

    import lombok.Data;
    
    /**
     * Row type for records read from T13_REF_AIRPORT_CITY_LINK.csv:
     * one airport-to-city association. Accessors come from Lombok's {@code @Data}.
     */
    @Data
    public class AirportCityLink {
        /** Hub id of the airport side of the link. */
        private String airport_hub_id;

        /** Hub id of the city side of the link. */
        private String city_hub_id;
    }
    

    City_Sat

    import lombok.Data;
    
    /**
     * Row type for records read from T13_REF_CITY_SAT.csv.
     * Accessors come from Lombok's {@code @Data}.
     */
    @Data
    public class CitySat {
        /** City hub id; the join key towards both link tables. */
        private String hub_id;

        /** City code (CITY_CD column). */
        private String city_cd;

        /** City name from the NAME_CN column. */
        private String name_cn;

        /** City name from the NAME_EN column. */
        private String name_en;
    }
    

    City_Country_Link

    import lombok.Data;
    
    /**
     * Row type for records read from T13_REF_CITY_COUNTRY_LINK.csv:
     * one city-to-country association. Accessors come from Lombok's {@code @Data}.
     */
    @Data
    public class CityCountryLink {
        /** Hub id of the country side of the link. */
        private String country_hub_id;

        /** Hub id of the city side of the link. */
        private String city_hub_id;
    }
    

    Country_Sat

    import lombok.Data;
    
    /**
     * Row type for records read from T13_REF_COUNTRY_SAT.csv.
     * Accessors come from Lombok's {@code @Data}.
     */
    @Data
    public class CountrySat {
        /** Country hub id; joined against {@code CityCountryLink.country_hub_id}. */
        private String hub_id;

        /** Country code (COUNTRY_CD column). */
        private String country_cd;

        /** Country name from the NAME_CN column. */
        private String name_cn;

        /** Country name from the NAME_EN column. */
        private String name_en;
    }
    

    Flink_Csv

    点击查看Flink_Csv代码
    import org.apache.flink.api.common.functions.MapFunction;
    import org.apache.flink.api.common.operators.Order;
    import org.apache.flink.api.common.typeinfo.TypeHint;
    import org.apache.flink.api.common.typeinfo.TypeInformation;
    import org.apache.flink.api.java.DataSet;
    import org.apache.flink.api.java.ExecutionEnvironment;
    import org.apache.flink.api.java.functions.KeySelector;
    import org.apache.flink.api.java.operators.MapOperator;
    import org.apache.flink.api.java.operators.SortPartitionOperator;
    import org.apache.flink.api.java.tuple.Tuple1;
    import org.apache.flink.api.java.tuple.Tuple2;
    import org.apache.flink.api.java.tuple.Tuple7;
    import org.apache.flink.core.fs.FileSystem;
    import org.apache.flink.table.api.Table;
    import org.apache.flink.table.api.java.BatchTableEnvironment;
    
    import java.text.SimpleDateFormat;
    import java.util.Date;
    
    public class FlinkCsv
    {
        public static void main(String[] args) throws Exception
        {
    		long s4 = System.currentTimeMillis();
    		t4();
    		System.out.println((System.currentTimeMillis() - s4) + "u");
    		long s5 = System.currentTimeMillis();
    		t5();
    		System.out.println((System.currentTimeMillis() - s5) + "d");
        }
    
        private static void t5() throws Exception
        {
            ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
            env.setParallelism(1);
            BatchTableEnvironment table_env = BatchTableEnvironment.getTableEnvironment(env);
            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH-mm-ss SSS");
    
            DataSet<AirportSat> data_airportsat = env.readCsvFile("D:\T13_REF_AIRPORT_SAT.csv")
                    .fieldDelimiter(",").ignoreFirstLine().includeFields(true/*, true, false, true, true*/)
                    .pojoType(AirportSat.class, "hub_id"/*, "iata_cd", "name_cn", "name_en"*/);
    
            DataSet<AirportCityLink> data_airportcitylink = env.readCsvFile("D:\T13_REF_AIRPORT_CITY_LINK.csv")
                    .fieldDelimiter(",").ignoreFirstLine().includeFields(false, true, true)
                    .pojoType(AirportCityLink.class, "airport_hub_id", "city_hub_id");
    
            DataSet<CitySat> data_citysat = env.readCsvFile("D:\T13_REF_CITY_SAT.csv")
                    .fieldDelimiter(",").ignoreFirstLine().includeFields(true, true, true, true)
                    .pojoType(CitySat.class, "hub_id", "city_cd", "name_cn", "name_en");
    
            DataSet<CityCountryLink> data_citycountrylink = env.readCsvFile("D:\T13_REF_CITY_COUNTRY_LINK.csv")
                    .fieldDelimiter(",").ignoreFirstLine().includeFields(false, true, true)
                    .pojoType(CityCountryLink.class, "country_hub_id", "city_hub_id");
    
            DataSet<CountrySat> data_countrysat = env.readCsvFile("D:\T13_REF_COUNTRY_SAT.csv")
                    .fieldDelimiter(",").ignoreFirstLine().includeFields(true, true, false, false, true, true)
                    .pojoType(CountrySat.class, "hub_id", "country_cd", "name_cn", "name_en");
    
            table_env.registerTable("t13_ref_airport_sat", table_env.fromDataSet(data_airportsat));
            table_env.registerTable("t13_ref_airport_city_link", table_env.fromDataSet(data_airportcitylink));
            table_env.registerTable("t13_ref_city_sat", table_env.fromDataSet(data_citysat));
            table_env.registerTable("t13_ref_city_country_link", table_env.fromDataSet(data_citycountrylink));
            table_env.registerTable("t13_ref_country_sat", table_env.fromDataSet(data_countrysat));
    
    
            String sql = "select count(*) 
    " +
                    "	from t13_ref_airport_sat x1,t13_ref_airport_city_link x2,
    " +
                    "	t13_ref_city_sat x3,t13_ref_city_country_link x4,t13_ref_country_sat x5
    " +
                    "	where x1.hub_id=x2.airport_hub_id
    " +
                    "		and x2.city_hub_id=x3.hub_id
    " +
                    "		and x3.hub_id=x4.city_hub_id
    " +
                    "		and x4.country_hub_id=x5.hub_id";
    
            String sql_country = "select x5.name_cn country_cn_name,count(x1.hub_id) count_airport
    " +
                    "	from t13_ref_airport_sat x1,t13_ref_airport_city_link x2,
    " +
                    "	t13_ref_city_sat x3,t13_ref_city_country_link x4,t13_ref_country_sat x5
    " +
                    "	where x1.hub_id=x2.airport_hub_id
    " +
                    "		and x2.city_hub_id=x3.hub_id
    " +
                    "		and x3.hub_id=x4.city_hub_id
    " +
                    "		and x4.country_hub_id=x5.hub_id
    " +
                    "	group by x5.name_cn
    " +
                    "	order by count_airport desc";
    
            String sql_all = "select 
    " +
                    "	x5.country_cd,
    " +
                    "	x5.name_cn country_name_cn,
    " +
                    "	x5.name_en country_name_en,
    " +
                    "	x3.city_cd,
    " +
                    "	x3.name_cn city_cn_name,
    " +
                    "	x3.name_en city_en_name,
    " +
                    "count(x1.hub_id) count_airport
    " +
                    "	from t13_ref_airport_sat x1,t13_ref_airport_city_link x2,t13_ref_city_sat x3,t13_ref_city_country_link x4,t13_ref_country_sat x5
    " +
                    "	where x1.hub_id=x2.airport_hub_id
    " +
                    "		and x2.city_hub_id=x3.hub_id
    " +
                    "		and x3.hub_id=x4.city_hub_id
    " +
                    "		and x4.country_hub_id=x5.hub_id
    " +
                    "	group by x5.country_cd,x5.name_cn,x5.name_en,x3.city_cd,x3.name_cn,x3.name_en
    " +
                    "	order by count_airport desc";
    
    
            DataSet<Tuple1<Long>> map = table_env.toDataSet(table_env.sqlQuery(sql),
                    TypeInformation.of(new TypeHint<Tuple1<Long>>()
                    {
                    }));
            map.print();
    
            DataSet<Tuple2<String, Long>> map_country = table_env.toDataSet(table_env.sqlQuery(sql_country),
                    TypeInformation.of(new TypeHint<Tuple2<String, Long>>()
                    {
                    }));
            System.out.println(map_country.count());
            map_country.print();
    
            Table result_country = table_env.sqlQuery(sql_country);
            DataSet<Tuple7<String, String, String, String, String, String, Long>> map_all = table_env.toDataSet(table_env.sqlQuery(sql_all),
                    TypeInformation.of(new TypeHint<Tuple7<String, String, String, String, String, String, Long>>()
                    {
                    }));
            System.out.println(map_all.count());
            map_all.print();
    
            map.writeAsCsv("D:\Flink_CSV\" + sdf.format(new Date()) + "______map.csv",
                    "
    ", ",", FileSystem.WriteMode.OVERWRITE).setParallelism(1);
            System.out.println("T打印完成______map...");
            map_country.writeAsCsv("D:\Flink_CSV\" + sdf.format(new Date()) + "______map_country.csv",
                    "
    ", ",", FileSystem.WriteMode.OVERWRITE).setParallelism(1);
            System.out.println("T打印完成______map_country...");
            map_all.writeAsCsv("D:\Flink_CSV\" + sdf.format(new Date()) + "______map_all.csv",
                    "
    ", ",", FileSystem.WriteMode.OVERWRITE).setParallelism(1);
            System.out.println("T打印完成______map_all...");
    
            env.execute("Hello!@ Fuck...");
        }
    
        private static void t4() throws Exception
        {
            ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
            env.setParallelism(1);
            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH-mm-ss SSS");
    
            DataSet<AirportSat> data_airportsat = env.readCsvFile("D:\T13_REF_AIRPORT_SAT.csv")
                    .fieldDelimiter(",").ignoreFirstLine().includeFields(true/*, true, false, true, true*/)
                    .pojoType(AirportSat.class, "hub_id"/*, "iata_cd", "name_cn", "name_en"*/);
    
            DataSet<AirportCityLink> data_airportcitylink = env.readCsvFile("D:\T13_REF_AIRPORT_CITY_LINK.csv")
                    .fieldDelimiter(",").ignoreFirstLine().includeFields(false, true, true)
                    .pojoType(AirportCityLink.class, "airport_hub_id", "city_hub_id");
    
            DataSet<CitySat> data_citysat = env.readCsvFile("D:\T13_REF_CITY_SAT.csv")
                    .fieldDelimiter(",").ignoreFirstLine().includeFields(true, true, true, true)
                    .pojoType(CitySat.class, "hub_id", "city_cd", "name_cn", "name_en");
    
            DataSet<CityCountryLink> data_citycountrylink = env.readCsvFile("D:\T13_REF_CITY_COUNTRY_LINK.csv")
                    .fieldDelimiter(",").ignoreFirstLine().includeFields(false, true, true)
                    .pojoType(CityCountryLink.class, "country_hub_id", "city_hub_id");
    
            DataSet<CountrySat> data_countrysat = env.readCsvFile("D:\T13_REF_COUNTRY_SAT.csv")
                    .fieldDelimiter(",").ignoreFirstLine().includeFields(true, true, false, false, true, true)
                    .pojoType(CountrySat.class, "hub_id", "country_cd", "name_cn", "name_en");
    
            MapOperator<Tuple2<Tuple2<Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat>, CityCountryLink>, CountrySat>,
                    Tuple7<String, String, String, String, String, String, Long>> map = data_airportsat
                    .join(data_airportcitylink).where("hub_id").equalTo("airport_hub_id")
                    .join(data_citysat).where(new KeySelector<Tuple2<AirportSat, AirportCityLink>, String>()
                    {
                        @Override
                        public String getKey(Tuple2<AirportSat, AirportCityLink> t) throws Exception
                        {
                            return t.f1.getCity_hub_id();
                        }
                    }).equalTo("hub_id")
                    .join(data_citycountrylink).where(new KeySelector<Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat>, String>()
                    {
                        @Override
                        public String getKey(Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat> t) throws Exception
                        {
                            return t.f1.getHub_id();
                        }
                    }).equalTo("city_hub_id")
                    .join(data_countrysat).where(new KeySelector<Tuple2<Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat>, CityCountryLink>, String>()
                    {
                        @Override
                        public String getKey(Tuple2<Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat>, CityCountryLink> t) throws Exception
                        {
                            return t.f1.getCountry_hub_id();
                        }
                    }).equalTo("hub_id")
                    .map(new MapFunction<Tuple2<Tuple2<Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat>, CityCountryLink>, CountrySat>,
                            Tuple7<String, String, String, String, String, String, Long>>()
                    {
    
                        @Override
                        public Tuple7<String, String, String, String, String, String, Long> map(
                                Tuple2<Tuple2<Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat>, CityCountryLink>, CountrySat> t) throws Exception
                        {
                            String country_cd = t.f1.getCountry_cd();
                            String country_cn_name = t.f1.getName_cn();
                            String country_en_name = t.f1.getName_en();
                            String city_cd = t.f0.f0.f1.getCity_cd();
                            String city_cn_name = t.f0.f0.f1.getName_cn();
                            String city_en_name = t.f0.f0.f1.getName_en();
                            long airport = 1L;
                            return new Tuple7<>(country_cd, country_cn_name, country_en_name, city_cd, city_cn_name, city_en_name, airport);
                        }
                    });
            //--------------------------------------------------------------------------------------------------------------
            System.out.println("总数量: " + map.count());
            SortPartitionOperator<Tuple2<String, Long>> map_country = map
                    .map(new MapFunction<Tuple7<String, String, String, String, String, String, Long>, Tuple2<String, Long>>()
                    {
                        @Override
                        public Tuple2<String, Long> map(Tuple7<String, String, String, String, String, String, Long> t) throws Exception
                        {
                            return new Tuple2<>(t.f1, t.f6);
                        }
                    }).groupBy(0).sum(1).sortPartition(1, Order.DESCENDING);
            System.out.println("国家分总数量: " + map_country.count());
    		//map_country.print();
            SortPartitionOperator<Tuple7<String, String, String, String, String, String, Long>> map_all = map
                    .groupBy(0, 1, 2, 3, 4, 5).sum(6).sortPartition(6, Order.DESCENDING);
            System.out.println("全分总数量: " + map_all.count());
    		//map_all.print();
    
    
            map.writeAsCsv("D:\Flink_CSV\" + sdf.format(new Date()) + "______map.csv",
                    "
    ", ",", FileSystem.WriteMode.OVERWRITE).setParallelism(1);
            System.out.println("打印完成______map...");
            map_country.writeAsCsv("D:\Flink_CSV\" + sdf.format(new Date()) + "______map_country.csv",
                    "
    ", ",", FileSystem.WriteMode.OVERWRITE).setParallelism(1);
            System.out.println("打印完成______map_country...");
            map_all.writeAsCsv("D:\Flink_CSV\" + sdf.format(new Date()) + "______map_all.csv",
                    "
    ", ",", FileSystem.WriteMode.OVERWRITE).setParallelism(1);
            System.out.println("打印完成______map_all...");
    
            env.execute("Hello!@ Fuck...");
        }
    }
    
    
  • 相关阅读:
    深度学习-Tensorflow2.2-深度学习基础和tf.keras{1}-softmax多分类-06
    深度学习-Tensorflow2.2-深度学习基础和tf.keras{1}-逻辑回归与交叉熵概述-05
    深度学习-Tensorflow2.2-深度学习基础和tf.keras{1}-多层感知器(神经网络)与激活函数概述-04
    深度学习-Tensorflow2.2-深度学习基础和tf.keras{1}-梯度下降算法概述-03
    深度学习-Tensorflow2.2-深度学习基础和tf.keras{1}-线性回归tf.keras概述-02
    深度学习-Tensorflow2.2-深度学习基础和tf.keras{1}-Tensorflow2.2-cpu/gpu环境安装-01
    深度学习-线性回归基础-02
    艾宾浩斯复习
    maven中设置jdk默认编译版本为1.8
    maven 编译命令
  • 原文地址:https://www.cnblogs.com/taopanfeng/p/11950264.html
Copyright © 2020-2023  润新知