I am trying to create a cluster with 3 brokers and 3 controllers using KRaft, but something goes wrong every time one broker fails and a previously failed broker comes back up.
Here is my Docker configuration:
version: "3.7"
services:
  controller-1:
    image: confluentinc/cp-kafka:latest
    hostname: controller-1
    container_name: controller-1
    environment:
      KAFKA_NODE_ID: 1
      KAFKA_PROCESS_ROLES: controller
      KAFKA_CONTROLLER_QUORUM_VOTERS: "1@controller-1:9093,2@controller-2:9093,3@controller-3:9093"
      KAFKA_LISTENERS: CONTROLLER://0.0.0.0:9093
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: CONTROLLER:PLAINTEXT
      KAFKA_LOG_DIRS: /var/lib/kafka/data
      CLUSTER_ID: KixvqJ76Qn-xLPPrSfBQSw # Set your generated UUID here
      KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER
    volumes:
      - controller_1_data:/var/lib/kafka/data
    user: 1000:1000
    networks:
      - kafka-connector
    restart: always
  controller-2:
    image: confluentinc/cp-kafka:latest
    hostname: controller-2
    container_name: controller-2
    environment:
      KAFKA_NODE_ID: 2
      KAFKA_PROCESS_ROLES: controller
      KAFKA_CONTROLLER_QUORUM_VOTERS: "1@controller-1:9093,2@controller-2:9093,3@controller-3:9093"
      KAFKA_LISTENERS: CONTROLLER://0.0.0.0:9093
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: CONTROLLER:PLAINTEXT
      KAFKA_LOG_DIRS: /var/lib/kafka/data
      CLUSTER_ID: KixvqJ76Qn-xLPPrSfBQSw
      KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER
    volumes:
      - controller_2_data:/var/lib/kafka/data
    networks:
      - kafka-connector
    restart: always
  controller-3:
    image: confluentinc/cp-kafka:latest
    hostname: controller-3
    container_name: controller-3
    environment:
      KAFKA_NODE_ID: 3
      KAFKA_PROCESS_ROLES: controller
      KAFKA_CONTROLLER_QUORUM_VOTERS: "1@controller-1:9093,2@controller-2:9093,3@controller-3:9093"
      KAFKA_LISTENERS: CONTROLLER://0.0.0.0:9093
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: CONTROLLER:PLAINTEXT
      KAFKA_LOG_DIRS: /var/lib/kafka/data
      CLUSTER_ID: KixvqJ76Qn-xLPPrSfBQSw
      KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER
    volumes:
      - controller_3_data:/var/lib/kafka/data
    networks:
      - kafka-connector
    restart: always
  kafka-1:
    image: confluentinc/cp-kafka:latest
    hostname: kafka-1
    container_name: kafka-1
    ports:
      - 29092:29092
      - 9092:9092
    environment:
      KAFKA_BROKER_ID: 11
      KAFKA_NODE_ID: 11
      KAFKA_PROCESS_ROLES: broker
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://192.168.18.2:9092
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,CONTROLLER:PLAINTEXT
      KAFKA_CONTROLLER_QUORUM_VOTERS: "2@controller-2:9093,3@controller-3:9093"
      KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT
      KAFKA_LOG_DIRS: /var/lib/kafka/data
      CLUSTER_ID: KixvqJ76Qn-xLPPrSfBQSw
      KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 3
      KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 2
      KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 3
    volumes:
      - kafka_1_data:/var/lib/kafka/data
    networks:
      - kafka-connector
    restart: always
  kafka-2:
    image: confluentinc/cp-kafka:latest
    hostname: kafka-2
    container_name: kafka-2
    ports:
      - 29093:29092
      - 9094:9092
    environment:
      KAFKA_BROKER_ID: 12
      KAFKA_NODE_ID: 12
      KAFKA_PROCESS_ROLES: broker
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://192.168.18.2:9094
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,CONTROLLER:PLAINTEXT
      KAFKA_CONTROLLER_QUORUM_VOTERS: "1@controller-1:9093,3@controller-3:9093"
      KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT
      KAFKA_LOG_DIRS: /var/lib/kafka/data
      CLUSTER_ID: KixvqJ76Qn-xLPPrSfBQSw
      KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 3
      KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 2
      KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 3
    volumes:
      - kafka_2_data:/var/lib/kafka/data
    networks:
      - kafka-connector
    restart: always
  kafka-3:
    image: confluentinc/cp-kafka:latest
    hostname: kafka-3
    container_name: kafka-3
    ports:
      - 29094:29092
      - 9096:9092
    environment:
      KAFKA_BROKER_ID: 13
      KAFKA_NODE_ID: 13
      KAFKA_PROCESS_ROLES: broker
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://192.168.18.2:9096
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,CONTROLLER:PLAINTEXT
      KAFKA_CONTROLLER_QUORUM_VOTERS: "1@controller-1:9093,2@controller-2:9093"
      KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT
      KAFKA_LOG_DIRS: /var/lib/kafka/data
      CLUSTER_ID: KixvqJ76Qn-xLPPrSfBQSw
      KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 3
      KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 2
      KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 3
    volumes:
      - kafka_3_data:/var/lib/kafka/data
    networks:
      - kafka-connector
    restart: always
  debezium:
    image: debezium/connect:2.4
    depends_on:
      - kafka-1
      - kafka-2
      - kafka-3
    environment:
      - BOOTSTRAP_SERVERS=kafka-1:29092,kafka-2:29093,kafka-3:29094
      - GROUP_ID=1
      - CONFIG_STORAGE_TOPIC=my_connect_configs
      - OFFSET_STORAGE_TOPIC=my_connect_offsets
      - STATUS_STORAGE_TOPIC=my_connect_statuses
    ports:
      - 8083:8083
    networks:
      - kafka-connector
    restart: always
    logging:
      driver: "json-file"
      options:
        max-size: "1g"
        max-file: "5"
  kafdrop:
    image: obsidiandynamics/kafdrop
    depends_on:
      - kafka-1
      - kafka-2
      - kafka-3
    ports:
      - 9000:9000
    environment:
      KAFKA_BROKER_CONNECT: kafka-1:9092,kafka-2:9094,kafka-3:9096
    networks:
      - kafka-connector
    restart: always
    logging:
      driver: "json-file"
      options:
        max-size: "1g"
        max-file: "5"
volumes:
  controller_1_data: ~
  controller_2_data: ~
  controller_3_data: ~
  kafka_1_data: ~
  kafka_2_data: ~
  kafka_3_data: ~
networks:
  kafka-connector:
    driver: bridge
Don't mind Debezium for now; I am just trying to get the cluster running and produce some messages.
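For context, this is roughly how I bring the stack up and produce test data (a sketch: the topic name test-topic is arbitrary, and the commands assume the kafka-topics / kafka-console-producer CLIs bundled in the cp-kafka image, reaching the broker on its local 9092 listener):

docker compose up -d

# create a topic replicated across all 3 brokers (test-topic is just an example name)
docker exec -it kafka-1 kafka-topics --bootstrap-server localhost:9092 \
  --create --topic test-topic --partitions 3 --replication-factor 3

# type a few messages into the console producer
docker exec -it kafka-1 kafka-console-producer --bootstrap-server localhost:9092 \
  --topic test-topic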
This is the error I get from broker-3:
[2024-10-18 11:39:12,347] INFO [MetadataLoader id=13] initializeNewPublishers: the loader is still catching up because we still don't know the high water mark yet. (org.apache.kafka.image.loader.MetadataLoader)
[2024-10-18 11:39:12,420] INFO [RaftManager id=13] Registered the listener org.apache.kafka.image.loader.MetadataLoader@1574972196 (org.apache.kafka.raft.KafkaRaftClient)
[2024-10-18 11:39:12,455] INFO [MetadataLoader id=13] initializeNewPublishers: the loader is still catching up because we still don't know the high water mark yet. (org.apache.kafka.image.loader.MetadataLoader)
[2024-10-18 11:39:12,559] INFO [MetadataLoader id=13] initializeNewPublishers: the loader is still catching up because we still don't know the high water mark yet. (org.apache.kafka.image.loader.MetadataLoader)
[2024-10-18 11:39:12,695] INFO [MetadataLoader id=13] initializeNewPublishers: the loader is still catching up because we still don't know the high water mark yet. (org.apache.kafka.image.loader.MetadataLoader)
[2024-10-18 11:39:12,801] INFO [MetadataLoader id=13] initializeNewPublishers: the loader is still catching up because we still don't know the high water mark yet. (org.apache.kafka.image.loader.MetadataLoader)
[2024-10-18 11:39:12,901] INFO [MetadataLoader id=13] initializeNewPublishers: the loader is still catching up because we still don't know the high water mark yet. (org.apache.kafka.image.loader.MetadataLoader)
[2024-10-18 11:39:13,002] INFO [MetadataLoader id=13] initializeNewPublishers: the loader is still catching up because we still don't know the high water mark yet. (org.apache.kafka.image.loader.MetadataLoader)
[2024-10-18 11:39:13,063] ERROR Encountered fatal fault: Unexpected error in raft IO thread (org.apache.kafka.server.fault.ProcessTerminatingFaultHandler)
java.lang.IllegalStateException: Cannot transition to Follower with leaderId=3 and epoch=80 since it is not one of the voters [1, 2]
at org.apache.kafka.raft.QuorumState.transitionToFollower(QuorumState.java:381)
at org.apache.kafka.raft.KafkaRaftClient.transitionToFollower(KafkaRaftClient.java:518)
at org.apache.kafka.raft.KafkaRaftClient.maybeTransition(KafkaRaftClient.java:1523)
at org.apache.kafka.raft.KafkaRaftClient.maybeHandleCommonResponse(KafkaRaftClient.java:1473)
at org.apache.kafka.raft.KafkaRaftClient.handleFetchResponse(KafkaRaftClient.java:1071)
at org.apache.kafka.raft.KafkaRaftClient.handleResponse(KafkaRaftClient.java:1550)
at org.apache.kafka.raft.KafkaRaftClient.handleInboundMessage(KafkaRaftClient.java:1676)
at org.apache.kafka.raft.KafkaRaftClient.poll(KafkaRaftClient.java:2251)
at kafka.raft.KafkaRaftManager$RaftIoThread.doWork(RaftManager.scala:64)
at org.apache.kafka.server.util.ShutdownableThread.run(ShutdownableThread.java:127)
If broker-2 goes down instead, the only difference is this part:
[2024-10-18 11:43:54,404] ERROR Encountered fatal fault: Unexpected error in raft IO thread (org.apache.kafka.server.fault.ProcessTerminatingFaultHandler)
java.lang.IllegalStateException: Cannot transition to Follower with leaderId=2 and epoch=123 since it is not one of the voters [1, 3]
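In case it matters, the Raft quorum can be inspected from one of the brokers like this (a sketch; the kafka-metadata-quorum tool ships with the cp-kafka image, and localhost:9092 assumes the broker's own listener):

# show the current leader, epoch and voter set as this broker sees them
docker exec -it kafka-1 kafka-metadata-quorum --bootstrap-server localhost:9092 describe --status

# per-replica lag and last-fetch details for the metadata quorum
docker exec -it kafka-1 kafka-metadata-quorum --bootstrap-server localhost:9092 describe --replication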