Reading a large file into Elasticsearch for analysis with Java (how to handle large CSV files in Java)

Time:2022-1-15
Task: quickly analyze a 2 GB CSV file.
Approach: use Java to read the file line by line and import the data into Elasticsearch in batches,
then use Elasticsearch's powerful aggregation capabilities to analyze the data — done within an hour.
package com.example.demo;

import com.alibaba.fastjson.JSON;
import com.example.demo.entity.Entity;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.xcontent.XContentType;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.boot.test.context.SpringBootTest;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Objects;

/**
 * Imports a large (~2 GB, ~630k row) CSV file into Elasticsearch, one
 * document per row, so the data can be analyzed with ES aggregations.
 *
 * <p>CSV format: comma-separated, fixed column positions, with a header row
 * whose code column equals {@code "xxx"}.
 *
 * @author lhb
 * @date 2021/11/11
 * @since 1.0.0
 */
@SpringBootTest
public class ImportTest {

    /** Every document goes into this index (must match the ES mapping). */
    private static final String INDEX = "index20211111";

    /** Rows per bulk request; keeps each request a manageable size. */
    private static final int BATCH_SIZE = 10000;

    @Autowired
    @Qualifier("client")
    private RestHighLevelClient restHighLevelClient;

    /**
     * Streams the CSV line by line and indexes each row with a single index
     * request. Correct but very slow for 630k rows — prefer {@link #batchInsert()}.
     */
    @Test
    void insert() {
        // CSV file is ~2 GB, ~630k rows, a dozen-plus columns.
        // BUG FIX: in the original literal "D:\\file\211111.csv" the "\211" is
        // parsed as an octal escape, silently corrupting the path — the
        // backslash must be doubled.
        String filePath = "D:\\file\\211111.csv";

        LineIterator it;
        try {
            it = FileUtils.lineIterator(new File(filePath), "UTF-8");
        } catch (IOException e) {
            e.printStackTrace();
            // BUG FIX: the original fell through with a null iterator and
            // threw an NPE on it.hasNext(); bail out instead.
            return;
        }
        try {
            while (it.hasNext()) {
                String line = it.nextLine();
                // Columns are comma-separated; positions are fixed for this file.
                String[] columns = line.split(",");
                // Cells carry padding spaces — trim them.
                String name = columns[6].trim();
                String code = columns[8].trim();
                String num = columns[11].trim();

                if (Objects.equals("xxx", code)) {
                    // Skip the header row before building anything.
                    continue;
                }
                Entity entity = new Entity();
                entity.setCode(code);
                entity.setName(name);
                try {
                    // BUG FIX: guard against malformed numeric cells instead of
                    // letting NumberFormatException abort the whole import
                    // (batchInsert already guarded; insert did not).
                    entity.setNum(Long.parseLong(num));
                } catch (NumberFormatException e) {
                    System.out.println("wrong data " + code + "==" + num);
                    continue;
                }
                entity.setCreateTime(new Date());
                singleInsert2(INDEX, entity);
            }
        } finally {
            LineIterator.closeQuietly(it);
        }
    }

    /**
     * Streams the CSV and indexes rows in bulk requests of {@link #BATCH_SIZE}
     * documents — orders of magnitude faster than {@link #insert()}.
     */
    @Test
    void batchInsert() {
        // BUG FIX: "\211" was an octal escape in the original path literal.
        String filePath = "D:\\express\\211111.csv";

        LineIterator it;
        try {
            it = FileUtils.lineIterator(new File(filePath), "UTF-8");
        } catch (IOException e) {
            e.printStackTrace();
            // BUG FIX: avoid the NPE on a null iterator.
            return;
        }
        try {
            List<Entity> entities = new ArrayList<>(BATCH_SIZE);
            while (it.hasNext()) {
                String[] columns = it.nextLine().split(",");
                // NOTE(review): column order here (code=6, name=8) is swapped
                // relative to insert() (name=6, code=8) in the original code —
                // kept as-is; confirm which layout matches the actual file.
                String code = columns[6].trim();
                String name = columns[8].trim();
                String num = columns[11].trim();

                if (Objects.equals("xxx", code)) {
                    // Skip the header row.
                    continue;
                }
                Entity entity = new Entity();
                entity.setCode(code);
                entity.setName(name);
                try {
                    entity.setNum(Long.parseLong(num));
                } catch (NumberFormatException e) {
                    // BUG FIX: the original logging line did not compile
                    // ("System. out. Println", undefined "Num"); log and skip
                    // the bad row.
                    System.out.println("wrong data " + code + "==" + num);
                    continue;
                }
                entity.setCreateTime(new Date());

                entities.add(entity);
                if (entities.size() >= BATCH_SIZE) {
                    flushBatch(entities);
                }
            }
            // BUG FIX: the original silently dropped the final partial batch
            // (any remainder < 10000 rows); flush whatever is left.
            flushBatch(entities);
        } finally {
            LineIterator.closeQuietly(it);
        }
    }

    /**
     * Sends the accumulated entities as one bulk request and clears the list.
     * No-op on an empty list.
     *
     * @param entities batch to index; always cleared afterwards (matching the
     *                 original behavior of continuing past a failed batch)
     */
    private void flushBatch(List<Entity> entities) {
        if (entities.isEmpty()) {
            return;
        }
        System.out.println("flushing batch of " + entities.size());
        try {
            batchInsert2(INDEX, entities);
        } catch (IOException e) {
            // Log and drop the failed batch so the import can continue.
            e.printStackTrace();
        }
        entities.clear();
    }

    /**
     * Bulk-indexes the given entities into {@code index} with one request.
     *
     * @param index    target index name
     * @param entities documents to index, serialized as JSON
     * @throws IOException if the bulk request fails at the transport level
     */
    public void batchInsert2(String index, List<Entity> entities) throws IOException {
        BulkRequest bulkRequest = new BulkRequest(index);
        System.out.println("entities.sz = " + entities.size());
        for (Entity entity : entities) {
            IndexRequest request = new IndexRequest();
            request.source(JSON.toJSONString(entity), XContentType.JSON);
            bulkRequest.add(request);
        }
        // BUG FIX: surface per-document failures instead of discarding the
        // bulk response entirely.
        if (restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT).hasFailures()) {
            System.out.println("bulk request reported document-level failures");
        }
    }

    /**
     * Indexes a single entity — works, but far too slow for large imports.
     *
     * @param index  target index name
     * @param entity document to index, serialized as JSON
     */
    public void singleInsert2(String index, Entity entity) {
        IndexRequest request = new IndexRequest(index);
        request.source(JSON.toJSONString(entity), XContentType.JSON);
        try {
            restHighLevelClient.index(request, RequestOptions.DEFAULT);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
The entity class — customize the fields to match your own CSV columns:
package com.example.demo.entity;

import lombok.Data;

import java.util.Date;

/**
 * Row model for one CSV record indexed into Elasticsearch.
 *
 * <p>Lombok {@code @Data} generates the getters/setters used by the import
 * code, plus {@code equals}/{@code hashCode}/{@code toString}.
 *
 * @author lhb
 * @date 2021/11/11
 * @since 1.0.0
 */
@Data
public class Entity {

    /**
     * Code column from the CSV; mapped as {@code keyword} in the ES index.
     */
    private String code;
    /**
     * Name column from the CSV; mapped as {@code keyword} in the ES index.
     */
    private String name;
    /**
     * Quantity column, parsed from the CSV cell with {@code Long.parseLong};
     * mapped as {@code long} and summed by the aggregation queries.
     */
    private Long num;
    // Set to the import time (new Date()) by the loader; mapped as ES "date".
    private Date createTime;

}
Create the index mapping before importing. Note: the index name must match the one the code writes to (`index20211111`):
PUT index20211111
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 1
  },
  "mappings": {
    "properties": {
      "code": {
        "type": "keyword"
      },
      "name": {
        "type": "keyword"
      },
      "num": {
        "type": "long"
      },
      "createTime": {
        "type": "date"
      }
    }
  }
}

Start analyzing data:

GET index20211111/_count
{}

 

# Returns the total document count: 630,000

{
  "count" : 630000,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  }
}

GET index20211111/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "terms": {
          "code": [
            2222,
            1111,
            3333
          ]
        }
      }
    }
  },
  "size": 1,
  "track_total_hits": true,
  "aggs": {
    "per_code": {
      "terms": {
        "field": "code",
        "size": 200
      },
      "aggs": {
        "num": {
          "sum": {
            "field": "num"
          }
        }
      }
    },
    "sum_num": {
      "sum": {
        "field": "num"
      }
    }
  }
}