数据导入直接使用 ClickHouse 官方提供的测试数据
https://clickhouse.com/docs/zh/getting-started/example-datasets/opensky,此数据集中的数据是从完整的 OpenSky 数据集中派生和清理而来的,以说明 COVID-19 新冠肺炎大流行期间空中交通的发展情况 。它涵盖了自2019年1月1日以来该网络超过2500名成员看到的所有航班,总数据量有6600w 。
# 在服务器 /home/flightlist 目录执行以下命令,该目录会被挂载到 mysql-pod、influxdb-pod、clickhouse-pod 内$ wget -O- https://zenodo.org/record/5092942 | grep -oP 'https://zenodo.org/record/5092942/files/flightlist_d+_d+.csv.gz' | xargs wget# 批量解压 flightlist.gz 数据$ for file in flightlist_*.csv.gz; do gzip -d "$file"; done# 将 csv 处理成 influxdb 导入所需的 txt 格式(此过程大概耗时1小时)$ Python/ target=_blank class=infotextkey>Python clickhouse-test-1.0.0/influxdb_csv2txt.py
MySQL# 进入 mysql pod$ kubectl exec -it [influxdb-podname] -- bash# 连上 mysql 建库、建表$ mysql -uroot -p123456$ use test;$ CREATE TABLE `opensky` (`callsign` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,`number` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,`icao24` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,`registration` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,`typecode` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,`origin` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,`destination` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,`firstseen` datetime DEFAULT NULL,`lastseen` datetime DEFAULT NULL,`day` datetime DEFAULT NULL,`latitude_1` double DEFAULT NULL,`longitude_1` double DEFAULT NULL,`altitude_1` double DEFAULT NULL,`latitude_2` double DEFAULT NULL,`longitude_2` double DEFAULT NULL,`altitude_2` double DEFAULT NULL,KEY `idx_callsign` (`callsign`),KEY `idx_origin` (`origin`),KEY `idx_destination` (`destination`)) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;# 导入数据(大概耗时70分钟)$ load data local infile 'flightlist_20190101_20190131.csv' into table opensky character set utf8mb4 fields terminated by ',' lines terminated by 'n' ignore 1 lines;# 省略其他29条导入命令:load data local infile 'flightlist_*_*.csv' into table opensky character set utf8mb4 fields terminated by ',' lines terminated by 'n' ignore 1 lines;# 检查数据是否导入成功$ select count(*) from test.opensky;
InfluxDB# 进入 influxdb pod$ kubectl exec -it [influxdb-podname] -- bash# 导入数据(大概耗时30分钟)$ influx -username 'admin' -password 'admin123456' -import -path=/tmp/flightlist/flightlist_20190101_20190131.txt -precision=ns;# 省略其他29条导入命令:influx -username 'admin' -password 'admin123456' -import -path=/tmp/flightlist/flightlist_*_*.txt -precision=ns;# 检查数据是否导入成功$ influx -username 'admin' -password 'admin123456'$ select count(latitude_1) from test.autogen.opensky;
ClickHouse# 进入 clickhouse pod$ kubectl exec -it [clickhouse-podname] -- bash# 连上 clickhouse 建库、建表$ clickhouse-client$ create database test;$ use test;$ CREATE TABLE opensky(callsign String,number String,icao24 String,registration String,typecode String,origin String,destination String,firstseen DateTime,lastseen DateTime,day DateTime,latitude_1 Float64,longitude_1 Float64,altitude_1 Float64,latitude_2 Float64,longitude_2 Float64,altitude_2 Float64) ENGINE = MergeTree ORDER BY (origin, destination, callsign);$ exit# 导入数据(大概耗时75秒)$ cd /tmp/flightlist$ for file in flightlist_*.csv; do cat "$file" | clickhouse-client --date_time_input_format best_effort --query "INSERT INTO test.opensky FORMAT CSVWithNames"; done# 检查数据是否导入成功$ clickhouse-client$ SELECT count() FROM test.opensky;
测试场景MySQL$ mysql -uroot -p123456$ use test;-- 开启性能分析set profiling = 1;-- 查询磁盘空间select table_rows as `总行数`, (data_length + index_length)/1024/1024/1024 as `磁盘占用(G)` from information_schema.`TABLES` where table_name = 'opensky';-- 全表countselect count(latitude_1) from opensky;-- 全表max/minselect max(longitude_1),min(altitude_1) from opensky;-- 全表平均值select avg(latitude_2) from opensky;-- 全表方差select var_pop(longitude_2) from opensky;-- 复杂查询1:全表多个字段聚合查询select count(latitude_1),max(longitude_1),min(altitude_1),avg(latitude_2) from opensky;-- 复杂查询2:从莫斯科三个主要机场起飞的航班数量SELECT origin, count(1) AS c FROM opensky WHERE origin IN ('UUEE', 'UUDD', 'UUWW') GROUP BY origin;-- 输出分析结果show profiles;
InfluxDB$ influx -username 'admin' -password 'admin123456'$ use test;-- 耗时统计,queryReqDurationNs 是累计查询时间,2次任务的时间相减就是耗时select queryReq,queryReqDurationNs/1000/1000,queryRespBytes from _internal."monitor".httpd order by time desc limit 10;-- 查询磁盘空间select sum(diskBytes) / 1024 / 1024 /1024 from _internal."monitor"."shard" where time > now() - 10s group by "database";-- 全表countselect count(latitude_1) from opensky;-- 全表max/minselect max(longitude_1),min(altitude_1) from opensky;-- 全表平均值select mean(latitude_2) from opensky;-- 全表方差select stddev(longitude_2) from opensky;-- 复杂查询1:全表多个字段聚合查询select count(latitude_1),max(longitude_1),min(altitude_1),mean(latitude_2) from opensky;-- 复杂查询2:从莫斯科三个主要机场起飞的航班数量SELECT count(latitude_1) AS c FROM opensky WHERE origin =~/^UUEE|UUDD|UUWW$/ GROUP BY origin;
推荐阅读
- 四六级准考证号忘记了怎么查成绩? 英语四级忘记准考证
- 华为|269元 李小龙力荐华为S-Tag运动传感器:详细记录跑步数据
- 初中生优秀作文:那一次,我真的感动了 感动的作文400
- 微信用户记得关闭这6个设置,不然个人隐私易泄露了,望周知
- iPhone长篇笔记文章,怎么截成超长图片分享给别人?
- 九号、雅迪、爱玛、台铃电动车怎么选?都有什么优缺点?一次性分析明白
- 宫颈多久检查一次
- 一次minerd肉鸡木马的排查思路
- 月经一次比一次推迟
- 互联网|你还记得互联网没有“网暴”的日子吗?