[Mr. Zhao Qiang] Flink’s dataset operator

Time: 2021-9-18

[Mr. Zhao Qiang] Flink's dataset operator

Flink provides the DataSet API for processing bounded datasets and the DataStream API for processing unbounded streams. We can develop Java or Scala programs against these APIs to implement the corresponding functionality. The following examples illustrate some basic operators of the DataSet API.

[Mr. Zhao Qiang] Flink's dataset operator

Next, we will demonstrate the role of each operator through specific code.

1. Map, FlatMap and MapPartition

// Obtain the batch execution environment
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

// Build a small in-memory dataset of sentences
ArrayList<String> data = new ArrayList<String>();
data.add("I love Beijing");
data.add("I love China");
data.add("Beijing is the capital of China");
DataSource<String> text = env.fromCollection(data);

// map: every input element produces exactly one output element
// (here: one sentence -> one list containing its words)
DataSet<List<String>> mapData = text.map(new MapFunction<String, List<String>>() {

    public List<String> map(String sentence) throws Exception {
        // Split the sentence on blanks and gather the words into a list
        List<String> words = new ArrayList<String>();
        for (String word : sentence.split(" ")) {
            words.add(word);
        }
        return words;
    }
});
mapData.print();
System.out.println("*****************************************");

// flatMap: every input element may emit zero or more output elements
// (here: one sentence -> one output element per word)
DataSet<String> flatMapData = text.flatMap(new FlatMapFunction<String, String>() {

    public void flatMap(String sentence, Collector<String> out) throws Exception {
        for (String word : sentence.split(" ")) {
            out.collect(word);
        }
    }
});
flatMapData.print();

System.out.println("*****************************************");
/*    new MapPartitionFunction<String, String>
    The first String: the type of the data elements inside a partition
    The second String: the type of the elements after processing */
DataSet<String> mapPartitionData = text.mapPartition(new MapPartitionFunction<String, String>() {

    public void mapPartition(Iterable<String> values, Collector<String> out) throws Exception {
        // A per-partition operation is useful when setup is expensive:
        // e.g. a single database connection can serve the whole partition.
        // 'values' holds all data elements of one partition.
        for (String sentence : values) {
            for (String word : sentence.split(" ")) {
                out.collect(word);
            }
        }
        // Close the connection here
    }
});
mapPartitionData.print();

2. Filter and Distinct

// Obtain the batch execution environment
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

// Build a small in-memory dataset of sentences
ArrayList<String> data = new ArrayList<String>();
data.add("I love Beijing");
data.add("I love China");
data.add("Beijing is the capital of China");
DataSource<String> text = env.fromCollection(data);

// flatMap: split each sentence into individual words
DataSet<String> flatMapData = text.flatMap(new FlatMapFunction<String, String>() {

    public void flatMap(String data, Collector<String> collection) throws Exception {
        String[] words = data.split(" ");
        for (String w : words) {
            collection.collect(w);
        }
    }
});

// distinct: remove duplicate words
flatMapData.distinct().print();
System.out.println("*********************");

// filter: keep only words longer than 3 characters
flatMapData.filter(new FilterFunction<String>() {

    public boolean filter(String word) throws Exception {
        // Return the boolean condition directly instead of `cond ? true : false`
        return word.length() > 3;
    }
}).print();

3. Join operation

// Obtain the batch execution environment
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

// First table: (user ID, name)
ArrayList<Tuple2<Integer, String>> data1 = new ArrayList<Tuple2<Integer, String>>();
data1.add(new Tuple2<Integer, String>(1, "Tom"));
data1.add(new Tuple2<Integer, String>(2, "Mike"));
data1.add(new Tuple2<Integer, String>(3, "Mary"));
data1.add(new Tuple2<Integer, String>(4, "Jone"));
// Second table: (user ID, city where the user lives)
ArrayList<Tuple2<Integer, String>> data2 = new ArrayList<Tuple2<Integer, String>>();
data2.add(new Tuple2<Integer, String>(1, "Beijing"));
data2.add(new Tuple2<Integer, String>(2, "Shanghai"));
data2.add(new Tuple2<Integer, String>(3, "Guangzhou"));
data2.add(new Tuple2<Integer, String>(4, "Chongqing"));

// Implement a multi-table join query: (user ID, name, city)
DataSet<Tuple2<Integer, String>> table1 = env.fromCollection(data1);
DataSet<Tuple2<Integer, String>> table2 = env.fromCollection(data2);

// Join on field 0 (the user ID) of both tables
table1.join(table2).where(0).equalTo(0)
/* The first Tuple2<Integer, String>: element type of the first table
 * The second Tuple2<Integer, String>: element type of the second table
 * Tuple3<Integer, String, String>: result type of the join query */
.with(new JoinFunction<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple3<Integer, String, String>>() {
    public Tuple3<Integer, String, String> join(Tuple2<Integer, String> table1,
            Tuple2<Integer, String> table2) throws Exception {
        return new Tuple3<Integer, String, String>(table1.f0, table1.f1, table2.f1);
    }
}).print();

4. Cartesian product

// Obtain the batch execution environment
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

// First table: (user ID, name)
ArrayList<Tuple2<Integer, String>> data1 = new ArrayList<Tuple2<Integer, String>>();
data1.add(new Tuple2<Integer, String>(1, "Tom"));
data1.add(new Tuple2<Integer, String>(2, "Mike"));
data1.add(new Tuple2<Integer, String>(3, "Mary"));
data1.add(new Tuple2<Integer, String>(4, "Jone"));

// Second table: (user ID, city where the user lives)
ArrayList<Tuple2<Integer, String>> data2 = new ArrayList<Tuple2<Integer, String>>();
data2.add(new Tuple2<Integer, String>(1, "Beijing"));
data2.add(new Tuple2<Integer, String>(2, "Shanghai"));
data2.add(new Tuple2<Integer, String>(3, "Guangzhou"));
data2.add(new Tuple2<Integer, String>(4, "Chongqing"));

DataSet<Tuple2<Integer, String>> table1 = env.fromCollection(data1);
DataSet<Tuple2<Integer, String>> table2 = env.fromCollection(data2);

// cross: generate the Cartesian product of the two tables
// (every element of table1 paired with every element of table2 — 4 x 4 = 16 pairs)
table1.cross(table2).print();

5. First-N

// Obtain the batch execution environment
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

// Each record is (employee name, salary, department number)
DataSet<Tuple3<String, Integer, Integer>> grade =
        env.fromElements(new Tuple3<String, Integer, Integer>("Tom", 1000, 10),
                         new Tuple3<String, Integer, Integer>("Mary", 1500, 20),
                         new Tuple3<String, Integer, Integer>("Mike", 1200, 30),
                         new Tuple3<String, Integer, Integer>("Jerry", 2000, 10));

// first(n): take the first three records in insertion order
grade.first(3).print();
System.out.println("**********************");

// Sort within each partition: primarily by department number (field 2),
// then by salary (field 1), both ascending
grade.sortPartition(2, Order.ASCENDING).sortPartition(1, Order.ASCENDING).print();
System.out.println("**********************");

// Group by department number (field 2) and take the first record of each group
grade.groupBy(2).first(1).print();

6. Outer Join operation

// Obtain the batch execution environment
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

// First table: (user ID, name) — note: no record for ID 2
ArrayList<Tuple2<Integer, String>> data1 = new ArrayList<Tuple2<Integer, String>>();
data1.add(new Tuple2<Integer, String>(1, "Tom"));
data1.add(new Tuple2<Integer, String>(3, "Mary"));
data1.add(new Tuple2<Integer, String>(4, "Jone"));

// Second table: (user ID, city) — note: no record for ID 3
ArrayList<Tuple2<Integer, String>> data2 = new ArrayList<Tuple2<Integer, String>>();
data2.add(new Tuple2<Integer, String>(1, "Beijing"));
data2.add(new Tuple2<Integer, String>(2, "Shanghai"));
data2.add(new Tuple2<Integer, String>(4, "Chongqing"));

// Implement a multi-table join query: (user ID, name, city)
DataSet<Tuple2<Integer, String>> table1 = env.fromCollection(data1);
DataSet<Tuple2<Integer, String>> table2 = env.fromCollection(data2);

// Left outer join: every record of the left table (table1) is kept;
// when there is no matching record in table2, the right side is null
table1.leftOuterJoin(table2).where(0).equalTo(0)
      .with(new JoinFunction<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple3<Integer, String, String>>() {

        public Tuple3<Integer, String, String> join(Tuple2<Integer, String> table1,
                Tuple2<Integer, String> table2) throws Exception {
            // table2 is null for left-table records without a match
            if (table2 == null) {
                return new Tuple3<Integer, String, String>(table1.f0, table1.f1, null);
            } else {
                return new Tuple3<Integer, String, String>(table1.f0, table1.f1, table2.f1);
            }
        }
    }).print();

System.out.println("***********************************");
// Right outer join: every record of the right table (table2) is kept;
// when there is no matching record in table1, the left side is null
table1.rightOuterJoin(table2).where(0).equalTo(0)
      .with(new JoinFunction<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple3<Integer, String, String>>() {

        public Tuple3<Integer, String, String> join(Tuple2<Integer, String> table1,
                Tuple2<Integer, String> table2) throws Exception {
            // table1 is null for right-table records without a match
            if (table1 == null) {
                return new Tuple3<Integer, String, String>(table2.f0, null, table2.f1);
            } else {
                return new Tuple3<Integer, String, String>(table2.f0, table1.f1, table2.f1);
            }
        }
    }).print();

System.out.println("***********************************");

// Full outer join: records of both tables are kept; the unmatched side is null
table1.fullOuterJoin(table2).where(0).equalTo(0)
.with(new JoinFunction<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple3<Integer, String, String>>() {

    public Tuple3<Integer, String, String> join(Tuple2<Integer, String> table1, Tuple2<Integer, String> table2)
            throws Exception {
        if (table1 == null) {
            return new Tuple3<Integer, String, String>(table2.f0, null, table2.f1);
        } else if (table2 == null) {
            return new Tuple3<Integer, String, String>(table1.f0, table1.f1, null);
        } else {
            return new Tuple3<Integer, String, String>(table1.f0, table1.f1, table2.f1);
        }
    }

}).print();

[Mr. Zhao Qiang] Flink's dataset operator