I have a fio workload that runs as a Job.
The fio workload:
---
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: fio-target
spec:
  #storageClassName: ocs-storagecluster-cephfs
  storageClassName: ocs-storagecluster-ceph-rbd
  accessModes: ["ReadWriteOnce"]
  resources:
    requests:
      storage: 50Gi
---
kind: ConfigMap
apiVersion: v1
metadata:
  name: fio-config
data:
  workload.fio: |
    [simple-write]
    readwrite=write
    buffered=1
    blocksize=4k
    ioengine=libaio
    directory=/target
    #size=10G
    fill_fs=1
---
apiVersion: batch/v1
kind: Job
metadata:
  name: fio
spec:
  template:
    metadata:
      name: fio
    spec:
      containers:
        - name: fio
          image: quay.io/johnstrunk/fs-performance:latest
          command:
            - "/usr/bin/fio"
            - "--output-format=json"
            - "/etc/fio/workload.fio"
          volumeMounts:
            - name: target
              mountPath: /target
            - name: fio-config-volume
              mountPath: /etc/fio
      restartPolicy: Always
      volumes:
        - name: target
          persistentVolumeClaim:
            claimName: fio-target
        - name: fio-config-volume
          configMap:
            name: fio-config
Output of the fio workload:
fio: io_u error on file /target/simple-write.0.0: No space left on device: write offset=52638969856, buflen=4096
{
"fio version" : "fio-3.7",
"timestamp" : 1578468421,
"timestamp_ms" : 1578468421512,
"time" : "Wed Jan 8 07:27:01 2020",
"jobs" : [
{
"jobname" : "simple-write",
"groupid" : 0,
"error" : 0,
"eta" : 0,
"elapsed" : 149,
"job options" : {
"rw" : "write",
"buffered" : "1",
"bs" : "4k",
"ioengine" : "libaio",
"directory" : "/target",
"fill_device" : "1"
},
"read" : {
"io_bytes" : 0,
"io_kbytes" : 0,
"bw_bytes" : 0,
"bw" : 0,
"iops" : 0.000000,
"runtime" : 0,
"total_ios" : 0,
"short_ios" : 0,
"drop_ios" : 0,
"slat_ns" : {
"min" : 0,
"max" : 0,
"mean" : 0.000000,
"stddev" : 0.000000
},
"clat_ns" : {
"min" : 0,
"max" : 0,
"mean" : 0.000000,
"stddev" : 0.000000,
"percentile" : {
"1.000000" : 0,
"5.000000" : 0,
"10.000000" : 0,
"20.000000" : 0,
"30.000000" : 0,
"40.000000" : 0,
"50.000000" : 0,
"60.000000" : 0,
"70.000000" : 0,
"80.000000" : 0,
"90.000000" : 0,
"95.000000" : 0,
"99.000000" : 0,
"99.500000" : 0,
"99.900000" : 0,
"99.950000" : 0,
"99.990000" : 0
}
},
"lat_ns" : {
"min" : 0,
"max" : 0,
"mean" : 0.000000,
"stddev" : 0.000000
},
"bw_min" : 0,
"bw_max" : 0,
"bw_agg" : 0.000000,
"bw_mean" : 0.000000,
"bw_dev" : 0.000000,
"bw_samples" : 0,
"iops_min" : 0,
"iops_max" : 0,
"iops_mean" : 0.000000,
"iops_stddev" : 0.000000,
"iops_samples" : 0
},
"write" : {
"io_bytes" : 52638969856,
"io_kbytes" : 51405244,
"bw_bytes" : 355971772,
"bw" : 347628,
"iops" : 86907.177732,
"runtime" : 147874,
"total_ios" : 12851312,
"short_ios" : 0,
"drop_ios" : 0,
"slat_ns" : {
"min" : 2123,
"max" : 1877670923,
"mean" : 10619.164491,
"stddev" : 1295004.136689
},
"clat_ns" : {
"min" : 559,
"max" : 503883,
"mean" : 597.424963,
"stddev" : 334.725902,
"percentile" : {
"1.000000" : 572,
"5.000000" : 572,
"10.000000" : 572,
"20.000000" : 580,
"30.000000" : 580,
"40.000000" : 580,
"50.000000" : 580,
"60.000000" : 588,
"70.000000" : 588,
"80.000000" : 588,
"90.000000" : 596,
"95.000000" : 604,
"99.000000" : 780,
"99.500000" : 868,
"99.900000" : 1976,
"99.950000" : 7392,
"99.990000" : 11968
}
},
"lat_ns" : {
"min" : 2750,
"max" : 1877675884,
"mean" : 11260.353627,
"stddev" : 1295013.364993
},
"bw_min" : 8,
"bw_max" : 1046704,
"bw_agg" : 100.000000,
"bw_mean" : 369150.841727,
"bw_dev" : 237962.484144,
"bw_samples" : 278,
"iops_min" : 2,
"iops_max" : 261676,
"iops_mean" : 92287.737410,
"iops_stddev" : 59490.597358,
"iops_samples" : 278
},
"trim" : {
"io_bytes" : 0,
"io_kbytes" : 0,
"bw_bytes" : 0,
"bw" : 0,
"iops" : 0.000000,
"runtime" : 0,
"total_ios" : 0,
"short_ios" : 0,
"drop_ios" : 0,
"slat_ns" : {
"min" : 0,
"max" : 0,
"mean" : 0.000000,
"stddev" : 0.000000
},
"clat_ns" : {
"min" : 0,
"max" : 0,
"mean" : 0.000000,
"stddev" : 0.000000,
"percentile" : {
"1.000000" : 0,
"5.000000" : 0,
"10.000000" : 0,
"20.000000" : 0,
"30.000000" : 0,
"40.000000" : 0,
"50.000000" : 0,
"60.000000" : 0,
"70.000000" : 0,
"80.000000" : 0,
"90.000000" : 0,
"95.000000" : 0,
"99.000000" : 0,
"99.500000" : 0,
"99.900000" : 0,
"99.950000" : 0,
"99.990000" : 0
}
},
"lat_ns" : {
"min" : 0,
"max" : 0,
"mean" : 0.000000,
"stddev" : 0.000000
},
"bw_min" : 0,
"bw_max" : 0,
"bw_agg" : 0.000000,
"bw_mean" : 0.000000,
"bw_dev" : 0.000000,
"bw_samples" : 0,
"iops_min" : 0,
"iops_max" : 0,
"iops_mean" : 0.000000,
"iops_stddev" : 0.000000,
"iops_samples" : 0
},
"sync" : {
"lat_ns" : {
"min" : 0,
"max" : 0,
"mean" : 0.000000,
"stddev" : 0.000000,
"percentile" : {
"1.000000" : 0,
"5.000000" : 0,
"10.000000" : 0,
"20.000000" : 0,
"30.000000" : 0,
"40.000000" : 0,
"50.000000" : 0,
"60.000000" : 0,
"70.000000" : 0,
"80.000000" : 0,
"90.000000" : 0,
"95.000000" : 0,
"99.000000" : 0,
"99.500000" : 0,
"99.900000" : 0,
"99.950000" : 0,
"99.990000" : 0
}
},
"total_ios" : 0
},
"usr_cpu" : 8.353114,
"sys_cpu" : 27.498597,
"ctx" : 9500,
"majf" : 0,
"minf" : 20,
"iodepth_level" : {
"1" : 100.000000,
"2" : 0.000000,
"4" : 0.000000,
"8" : 0.000000,
"16" : 0.000000,
"32" : 0.000000,
">=64" : 0.000000
},
"latency_ns" : {
"2" : 0.000000,
"4" : 0.000000,
"10" : 0.000000,
"20" : 0.000000,
"50" : 0.000000,
"100" : 0.000000,
"250" : 0.000000,
"500" : 0.000000,
"750" : 98.191811,
"1000" : 1.513962
},
"latency_us" : {
"2" : 0.197637,
"4" : 0.019383,
"10" : 0.060476,
"20" : 0.012987,
"50" : 0.010000,
"100" : 0.010000,
"250" : 0.010000,
"500" : 0.000000,
"750" : 0.010000,
"1000" : 0.000000
},
"latency_ms" : {
"2" : 0.000000,
"4" : 0.000000,
"10" : 0.000000,
"20" : 0.000000,
"50" : 0.000000,
"100" : 0.000000,
"250" : 0.000000,
"500" : 0.000000,
"750" : 0.000000,
"1000" : 0.000000,
"2000" : 0.000000,
">=2000" : 0.000000
},
"latency_depth" : 1,
"latency_target" : 0,
"latency_percentile" : 100.000000,
"latency_window" : 0
}
],
"disk_util" : [
{
"name" : "rbd0",
"read_ios" : 35,
"write_ios" : 9010,
"read_merges" : 0,
"write_merges" : 9085,
"read_ticks" : 20979,
"write_ticks" : 23182697,
"in_queue" : 23199138,
"util" : 5.652565
}
]
}
I don't understand why this job is asking for more space. After running the job I also don't see the fio PVC listed under the kubelet_volume_stats_used_bytes metric.
It is also odd to see the storage used on the cluster grow from 0.4 GB to 148 GB.
As far as I understand it should only use 50GB. Can someone explain?
Hmm, it looks like you are running fio through some kind of wrapper (fio itself doesn't use YAML as its job file format), which shrinks the pool of people who might be willing to look at this issue... That said, thank you, because we can see what the real job parameters were, including the fio JSON output :-). Here is the key snippet from the "job options" block in the output above:

    "fill_device" : "1"
So you are setting fill_device=1 (by setting fill_fs=1, which is an alias). That option tells fio to grow a file until there is no space left and then start doing I/O on it. If that file isn't cleaned up, I can see how future runs won't find any space available!

It will use as much space as it can: it asks for all the space because you told it to by setting fill_fs=1. I cannot see why it should be limited to 50GB - was there something in your config that references 50GB?
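For illustration only, here is a minimal, untested workload.fio sketch that writes a fixed amount of data instead of filling the filesystem, assuming the commented-out size=10G in your ConfigMap was the bound you actually intended:

[simple-write]
readwrite=write
buffered=1
blocksize=4k
ioengine=libaio
directory=/target
# write a fixed 10G file instead of growing the file until the device is full
size=10G
# fill_fs=1 removed, so the job finishes at `size` rather than at "No space left on device"

fio also has an unlink=1 option to delete the data file when the job finishes, which may help if files left behind by earlier runs are eating the PVC.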
PS: The libaio ioengine won't be asynchronous when you use buffered I/O...
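Purely as a sketch of what an actually asynchronous variant could look like (direct=1 and iodepth are standard fio options; the depth of 16 is an arbitrary placeholder, not a tuned value):

[simple-write-direct]
readwrite=write
# O_DIRECT bypasses the page cache, so libaio submissions can really be asynchronous
direct=1
blocksize=4k
ioengine=libaio
# keep several requests in flight; with buffered I/O this depth would never be reached
iodepth=16
directory=/target
size=10G

With buffered=1, writes complete into the page cache almost immediately, which is also why the completion latencies in the JSON output above sit in the sub-microsecond range.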