Suppose you need to quickly analyze a 2 GB CSV file. Based on what we have learned so far, we can use Java to read the file line by line, import the data into Elasticsearch in batches, and then use Elasticsearch's powerful aggregations to analyze it, all within an hour.
package com.example.demo;
import com.alibaba.fastjson.JSON;
import com.example.demo.entity.Entity;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.xcontent.XContentType;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.boot.test.context.SpringBootTest;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Objects;
/**
* Read large files
* CSV format
*
* @author lhb
* @date 2021/11/11
* @since 1.0.0
*/
@SpringBootTest
public class ImportTest {
@Autowired
@Qualifier("client")
private RestHighLevelClient restHighLevelClient;
@Test
void insert() {
//2 GB CSV file, about 630,000 rows, more than ten fields
String filePath = "D:\\file\\211111.csv";
LineIterator it;
try {
it = FileUtils.lineIterator(new File(filePath), "UTF-8");
} catch (IOException e) {
e.printStackTrace();
return;
}
try {
while (it.hasNext()) {
String line = it.nextLine();
//System.out.println("line = " + line);
//Columns in the CSV line are separated by ",", so splitting gives the individual fields
String[] strArray = line.split(",");
//Fields may carry extra whitespace, so trim them
String name = strArray[6].trim();
String code = strArray[8].trim();
String num = strArray[11].trim();
System.out.println(code + "==" + num);
if (Objects.equals("xxx", code)) {
//Skip the header row ("xxx" stands for the header value of the code column)
continue;
}
Entity entity = new Entity();
entity.setCode(code);
entity.setNum(Long.parseLong(num));
entity.setName(name);
entity.setCreateTime(new Date());
String index = "index20211111";
singleInsert2(index, entity);
}
} finally {
LineIterator.closeQuietly(it);
}
}
@Test
void batchInsert() {
String filePath = "D:\\express\\211111.csv";
LineIterator it;
try {
it = FileUtils.lineIterator(new File(filePath), "UTF-8");
} catch (IOException e) {
e.printStackTrace();
return;
}
try {
int i = 0;
String index = "index20211111";
List<Entity> entities = new ArrayList<>();
while (it.hasNext()) {
String line = it.nextLine();
//System.out.println("line = " + line);
String[] strArray = line.split(",");
String code = strArray[6].trim();
String name = strArray[8].trim();
String num = strArray[11].trim();
System.out.println(code + "==" + num);
if (Objects.equals("xxx", code)) {
//Skip header
continue;
}
Entity entity = new Entity();
entity.setCode(code);
entity.setName(name);
try {
entity.setNum(Long.parseLong(num));
} catch (NumberFormatException e) {
e.printStackTrace();
System.out.println("bad row: " + code + "==" + num);
}
entity.setCreateTime(new Date());
//Batch insert: buffer the rows and flush every 10,000
entities.add(entity);
i++;
if (i % 10000 == 0) {
System.out.println("i = " + i);
try {
batchInsert2(index, entities);
} catch (IOException e) {
e.printStackTrace();
}
//Empty the processed list
entities.clear();
i = 0;
}
}
//Flush the last batch, which usually has fewer than 10,000 rows
if (!entities.isEmpty()) {
try {
batchInsert2(index, entities);
} catch (IOException e) {
e.printStackTrace();
}
}
} finally {
LineIterator.closeQuietly(it);
}
}
/**
* Bulk insert: much faster than indexing one document at a time.
*
* @param index
* @param entities
* @throws IOException
*/
public void batchInsert2(String index, List<Entity> entities) throws IOException {
BulkRequest bulkRequest = new BulkRequest(index);
System.out.println("entities.sz = " + entities.size());
for (Entity org : entities) {
IndexRequest request = new IndexRequest();
request.source(JSON.toJSONString(org), XContentType.JSON);
bulkRequest.add(request);
}
restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
}
/**
* Single-document insert: far too slow for a large data set.
*
* @param index
* @param entity
*/
public void singleInsert2(String index, Entity entity) {
IndexRequest request = new IndexRequest(index);
request.source(JSON.toJSONString(entity), XContentType.JSON);
try {
IndexResponse index1 = restHighLevelClient.index(request, RequestOptions.DEFAULT);
} catch (IOException e) {
e.printStackTrace();
}
}
}
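The manual counter above works, but the high-level REST client also ships a BulkProcessor that handles batching, flushing, and the final partial batch for you. The sketch below shows how the same import could look as an extra method in the test class; the method name, the 10,000-action batch size, and the 30-second close timeout are my own choices rather than part of the original code, and it assumes the same RestHighLevelClient bean and Entity class.
//Extra imports needed: org.elasticsearch.action.bulk.BulkProcessor,
//org.elasticsearch.action.bulk.BulkResponse, java.util.concurrent.TimeUnit
public void importWithBulkProcessor(String filePath, String index) throws Exception {
    BulkProcessor bulkProcessor = BulkProcessor.builder(
            (request, bulkListener) ->
                    restHighLevelClient.bulkAsync(request, RequestOptions.DEFAULT, bulkListener),
            new BulkProcessor.Listener() {
                @Override
                public void beforeBulk(long executionId, BulkRequest request) {
                    System.out.println("indexing " + request.numberOfActions() + " docs");
                }
                @Override
                public void afterBulk(long executionId, BulkRequest request, BulkResponse response) {
                    if (response.hasFailures()) {
                        System.out.println(response.buildFailureMessage());
                    }
                }
                @Override
                public void afterBulk(long executionId, BulkRequest request, Throwable failure) {
                    failure.printStackTrace();
                }
            })
            .setBulkActions(10_000) //flush automatically every 10,000 documents
            .build();
    LineIterator it = FileUtils.lineIterator(new File(filePath), "UTF-8");
    try {
        while (it.hasNext()) {
            String[] strArray = it.nextLine().split(",");
            String code = strArray[6].trim();
            if (Objects.equals("xxx", code)) {
                continue; //skip the header row
            }
            Entity entity = new Entity();
            entity.setCode(code);
            entity.setName(strArray[8].trim());
            entity.setNum(Long.parseLong(strArray[11].trim()));
            entity.setCreateTime(new Date());
            bulkProcessor.add(new IndexRequest(index)
                    .source(JSON.toJSONString(entity), XContentType.JSON));
        }
    } finally {
        LineIterator.closeQuietly(it);
        //flushes the last partial batch and waits for in-flight bulk requests
        bulkProcessor.awaitClose(30, TimeUnit.SECONDS);
    }
}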
The entity class; customize its fields to match your CSV columns.
package com.example.demo.entity;
import lombok.Data;
import java.util.Date;
/**
* @author lhb
* @date 2021/11/11
* @since 1.0.0
*/
@Data
public class Entity {
/**
* Code
*/
private String code;
/**
* Name
*/
private String name;
/**
* Quantity
*/
private Long num;
private Date createTime;
}
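The test autowires a RestHighLevelClient bean named "client". In case the project does not define one yet, a minimal configuration sketch could look like the following; the package, class name, host, and port are placeholders, not values from the original setup.
package com.example.demo.config;
import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
/**
 * Registers the RestHighLevelClient bean that ImportTest injects with @Qualifier("client").
 */
@Configuration
public class EsConfig {
    @Bean(name = "client")
    public RestHighLevelClient client() {
        //Replace host and port with your own Elasticsearch address
        return new RestHighLevelClient(
                RestClient.builder(new HttpHost("127.0.0.1", 9200, "http")));
    }
}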
Create the index mapping, then insert the data:
PUT index20211111
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 1
},
"mappings": {
"properties": {
"code": {
"type": "keyword"
},
"name": {
"type": "keyword"
},
"num": {
"type": "long"
},
"createTime": {
"type": "date"
}
}
}
}
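If you would rather create the index from Java instead of the Kibana console, the same settings and mapping can be sent with the high-level client's CreateIndexRequest. A sketch, mirroring the request above:
//Sketch: create the same index and mapping from Java
//(imports: org.elasticsearch.client.indices.CreateIndexRequest, org.elasticsearch.common.settings.Settings)
CreateIndexRequest createIndex = new CreateIndexRequest("index20211111");
createIndex.settings(Settings.builder()
        .put("index.number_of_shards", 1)
        .put("index.number_of_replicas", 1));
createIndex.mapping(
        "{\"properties\":{"
                + "\"code\":{\"type\":\"keyword\"},"
                + "\"name\":{\"type\":\"keyword\"},"
                + "\"num\":{\"type\":\"long\"},"
                + "\"createTime\":{\"type\":\"date\"}}}",
        XContentType.JSON);
restHighLevelClient.indices().create(createIndex, RequestOptions.DEFAULT);
Note that fastjson serializes the createTime Date as epoch milliseconds by default, which the default date mapping accepts.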
Start analyzing data:
GET index20211111/_count
{}
# Returns about 630,000 documents
{
"count" : 630000,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
}
}
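The same check can be done from Java inside the test, in case you want to verify the count right after the import. A small sketch using CountRequest:
//Sketch: verify the document count from Java
//(import org.elasticsearch.client.core.CountRequest)
CountRequest countRequest = new CountRequest("index20211111");
long total = restHighLevelClient.count(countRequest, RequestOptions.DEFAULT).getCount();
System.out.println("indexed docs = " + total);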
GET index20211111/_search
{
"query": {
"constant_score": {
"filter": {
"terms": {
"code": [
2222,
1111,
3333
]
}
}
}
},
"size": 1,
"track_total_hits": true,
"aggs": {
"per_code": {
"terms": {
"field": "code",
"size": 200
},
"aggs": {
"num": {
"sum": {
"field": "num"
}
}
}
},
"sum_num": {
"sum": {
"field": "num"
}
}
}
}
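The same query and aggregations can also be run through the Java client, which is handy when the result should feed another service instead of being read in Kibana. A sketch with minimal bucket parsing, assuming the same client bean and index name:
//Sketch: run the per-code sum aggregation from Java
//(imports: org.elasticsearch.action.search.SearchRequest, org.elasticsearch.action.search.SearchResponse,
// org.elasticsearch.index.query.QueryBuilders, org.elasticsearch.search.builder.SearchSourceBuilder,
// org.elasticsearch.search.aggregations.AggregationBuilders,
// org.elasticsearch.search.aggregations.bucket.terms.Terms,
// org.elasticsearch.search.aggregations.metrics.Sum)
SearchSourceBuilder source = new SearchSourceBuilder()
        .size(0) //only the aggregations are needed here
        .trackTotalHits(true)
        .query(QueryBuilders.constantScoreQuery(
                QueryBuilders.termsQuery("code", "2222", "1111", "3333")))
        .aggregation(AggregationBuilders.terms("per_code").field("code").size(200)
                .subAggregation(AggregationBuilders.sum("num").field("num")))
        .aggregation(AggregationBuilders.sum("sum_num").field("num"));
SearchRequest searchRequest = new SearchRequest("index20211111").source(source);
SearchResponse response = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);
Terms perCode = response.getAggregations().get("per_code");
for (Terms.Bucket bucket : perCode.getBuckets()) {
    Sum num = bucket.getAggregations().get("num");
    System.out.println(bucket.getKeyAsString() + " -> " + num.getValue());
}
Sum sumNum = response.getAggregations().get("sum_num");
System.out.println("total num = " + sumNum.getValue());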