В последние несколько дней я использовал данные TPC-DS для стресс-тестирования производительности записи Hudi 0.12.0. Я обнаружил, что примерно через полчаса после начала записи данных возникло исключение, и задача стала непрерывно перезапускаться. Журнал исключений задачи выглядит следующим образом:
2023-01-06 18:36:21
org.apache.flink.util.FlinkException: Global failure triggered by OperatorCoordinator for 'stream_write: hudi_store_sales_3_100g' (operator 2cead6506500176d7c394ec289525472).
at org.apache.flink.runtime.operators.coordination.OperatorCoordinatorHolder$LazyInitializedCoordinatorContext.failJob(OperatorCoordinatorHolder.java:553)
at org.apache.hudi.sink.StreamWriteOperatorCoordinator.lambda$start$0(StreamWriteOperatorCoordinator.java:187)
at org.apache.hudi.sink.utils.NonThrownExecutor.handleException(NonThrownExecutor.java:146)
at org.apache.hudi.sink.utils.NonThrownExecutor.lambda$wrapAction$0(NonThrownExecutor.java:133)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.hudi.exception.HoodieException: Executor executes action [initialize instant ] error
... 6 more
Caused by: org.apache.hudi.exception.HoodieRollbackException: Failed to rollback hdfs://host117:8020/warehouse/tablespace/managed/hive/hudidbtest/hudi_store_sales_3_100g commits 20230106183555401
at org.apache.hudi.client.BaseHoodieWriteClient.rollback(BaseHoodieWriteClient.java:789)
at org.apache.hudi.client.BaseHoodieWriteClient.rollbackFailedWrites(BaseHoodieWriteClient.java:1198)
at org.apache.hudi.client.BaseHoodieWriteClient.rollbackFailedWrites(BaseHoodieWriteClient.java:1181)
at org.apache.hudi.client.BaseHoodieWriteClient.rollbackFailedWrites(BaseHoodieWriteClient.java:1169)
at org.apache.hudi.client.BaseHoodieWriteClient.lambda$startCommit$afea71c0$1(BaseHoodieWriteClient.java:942)
at org.apache.hudi.common.util.CleanerUtils.rollbackFailedWrites(CleanerUtils.java:151)
at org.apache.hudi.client.BaseHoodieWriteClient.startCommit(BaseHoodieWriteClient.java:941)
at org.apache.hudi.sink.StreamWriteOperatorCoordinator.startInstant(StreamWriteOperatorCoordinator.java:374)
at org.apache.hudi.sink.StreamWriteOperatorCoordinator.lambda$initInstant$6(StreamWriteOperatorCoordinator.java:402)
at org.apache.hudi.sink.utils.NonThrownExecutor.lambda$wrapAction$0(NonThrownExecutor.java:130)
... 3 more
Caused by: org.apache.hudi.exception.HoodieException: org.apache.hudi.exception.HoodieException: Error occurs when executing flatMap
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at java.util.concurrent.ForkJoinTask.getThrowableException(ForkJoinTask.java:593)
at java.util.concurrent.ForkJoinTask.reportException(ForkJoinTask.java:677)
at java.util.concurrent.ForkJoinTask.invoke(ForkJoinTask.java:735)
at java.util.stream.ReduceOps$ReduceOp.evaluateParallel(ReduceOps.java:714)
at java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:233)
at java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:499)
at org.apache.hudi.client.common.HoodieFlinkEngineContext.flatMap(HoodieFlinkEngineContext.java:136)
at org.apache.hudi.table.action.rollback.BaseRollbackHelper.maybeDeleteAndCollectStats(BaseRollbackHelper.java:114)
at org.apache.hudi.table.action.rollback.BaseRollbackHelper.performRollback(BaseRollbackHelper.java:81)
at org.apache.hudi.table.action.rollback.BaseRollbackActionExecutor.executeRollback(BaseRollbackActionExecutor.java:234)
at org.apache.hudi.table.action.rollback.MergeOnReadRollbackActionExecutor.executeRollback(MergeOnReadRollbackActionExecutor.java:89)
at org.apache.hudi.table.action.rollback.BaseRollbackActionExecutor.doRollbackAndGetStats(BaseRollbackActionExecutor.java:216)
at org.apache.hudi.table.action.rollback.BaseRollbackActionExecutor.runRollback(BaseRollbackActionExecutor.java:110)
at org.apache.hudi.table.action.rollback.BaseRollbackActionExecutor.execute(BaseRollbackActionExecutor.java:137)
at org.apache.hudi.table.HoodieFlinkMergeOnReadTable.rollback(HoodieFlinkMergeOnReadTable.java:132)
at org.apache.hudi.client.BaseHoodieWriteClient.rollback(BaseHoodieWriteClient.java:772)
... 12 more
Caused by: org.apache.hudi.exception.HoodieException: Error occurs when executing flatMap
at org.apache.hudi.common.function.FunctionWrapper.lambda$throwingFlatMapWrapper$1(FunctionWrapper.java:50)
at java.util.stream.ReferencePipeline$7$1.accept(ReferencePipeline.java:267)
at java.util.ArrayList$ArrayListSpliterator.forEachRemaining(ArrayList.java:1382)
at java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:482)
at java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:472)
at java.util.stream.ReduceOps$ReduceTask.doLeaf(ReduceOps.java:747)
at java.util.stream.ReduceOps$ReduceTask.doLeaf(ReduceOps.java:721)
at java.util.stream.AbstractTask.compute(AbstractTask.java:316)
at java.util.concurrent.CountedCompleter.exec(CountedCompleter.java:731)
at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1056)
at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1692)
at java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:157)
Caused by: org.apache.hudi.exception.HoodieIOException: Error appending rollback block
at org.apache.hudi.table.action.rollback.BaseRollbackHelper.lambda$maybeDeleteAndCollectStats$309309f3$1(BaseRollbackHelper.java:148)
at org.apache.hudi.common.function.FunctionWrapper.lambda$throwingFlatMapWrapper$1(FunctionWrapper.java:48)
... 12 more
Caused by: java.io.IOException: Failed to replace a bad datanode on the existing pipeline due to no more good datanodes being available to try. (Nodes: current=[DatanodeInfoWithStorage[10.45.46.117:1019,DS-5a3355b0-c815-4687-ab9a-2c6492e72e23,DISK], DatanodeInfoWithStorage[10.45.46.120:1019,DS-ef4d896d-b58e-4242-a077-2c53380d3442,DISK]], original=[DatanodeInfoWithStorage[10.45.46.120:1019,DS-ef4d896d-b58e-4242-a077-2c53380d3442,DISK], DatanodeInfoWithStorage[10.45.46.117:1019,DS-5a3355b0-c815-4687-ab9a-2c6492e72e23,DISK]]). The current failed datanode replacement policy is DEFAULT, and a client may configure this via 'dfs.client.block.write.replace-datanode-on-failure.policy' in its configuration.
at org.apache.hadoop.hdfs.DataStreamer.findNewDatanode(DataStreamer.java:1304)
at org.apache.hadoop.hdfs.DataStreamer.addDatanode2ExistingPipeline(DataStreamer.java:1372)
at org.apache.hadoop.hdfs.DataStreamer.handleDatanodeReplacement(DataStreamer.java:1598)
at org.apache.hadoop.hdfs.DataStreamer.setupPipelineInternal(DataStreamer.java:1499)
at org.apache.hadoop.hdfs.DataStreamer.setupPipelineForAppendOrRecovery(DataStreamer.java:1481)
at org.apache.hadoop.hdfs.DataStreamer.run(DataStreamer.java:720)
Из приведённого выше журнала исключений видно, что сбой вызван проблемой на узлах данных HDFS (DataNode).
Проверим журналы двух узлов данных, упомянутых в приведённом выше исключении. За этот период в них зафиксированы следующие ошибки:
2023-01-06 18:36:02,561 ERROR datanode.DataNode (DataXceiver.java:writeBlock(863)) - DataNode{data=FSDataset{dirpath='[/hadoop/hdfs/data]'}, localName='host117:1019', datanodeUuid='4a2910a8-c600-4f03-aa8f-0eec0541cbef', xmitsInProgress=0}:Exception transfering block BP-105892518-10.45.46.117-1630557182227:blk_1079489557_7301204 to mirror 10.45.46.121:1019
java.io.EOFException: Unexpected EOF while trying to read response from server
at org.apache.hadoop.hdfs.protocolPB.PBHelperClient.vintPrefixed(PBHelperClient.java:549)
at org.apache.hadoop.hdfs.server.datanode.DataXceiver.writeBlock(DataXceiver.java:836)
at org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.opWriteBlock(Receiver.java:173)
at org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.processOp(Receiver.java:107)
at org.apache.hadoop.hdfs.server.datanode.DataXceiver.run(DataXceiver.java:290)
at java.lang.Thread.run(Thread.java:748)
2023-01-06 18:36:02,561 INFO datanode.DataNode (DataXceiver.java:writeBlock(922)) - opWriteBlock BP-105892518-10.45.46.117-1630557182227:blk_1079489557_7301204 received exception java.io.EOFException: Unexpected EOF while trying to read response from server
2023-01-06 18:36:02,561 ERROR datanode.DataNode (DataXceiver.java:run(321)) - host117:1019:DataXceiver error processing WRITE_BLOCK operation src: /10.45.46.117:35202 dst: /10.45.46.117:1019
java.io.EOFException: Unexpected EOF while trying to read response from server
at org.apache.hadoop.hdfs.protocolPB.PBHelperClient.vintPrefixed(PBHelperClient.java:549)
at org.apache.hadoop.hdfs.server.datanode.DataXceiver.writeBlock(DataXceiver.java:836)
at org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.opWriteBlock(Receiver.java:173)
at org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.processOp(Receiver.java:107)
at org.apache.hadoop.hdfs.server.datanode.DataXceiver.run(DataXceiver.java:290)
at java.lang.Thread.run(Thread.java:748)
2023-01-06 18:36:02,566 INFO DataNode.clienttrace (BlockReceiver.java:finalizeBlock(1533)) - src: /10.45.46.117:35194, dest: /10.45.46.117:1019, bytes: 105, op: HDFS_WRITE, cliID: DFSClient_NONMAPREDUCE_-290706940_34, offset: 0, srvID: 4a2910a8-c600-4f03-aa8f-0eec0541cbef, blockid: BP-105892518-10.45.46.117-1630557182227:blk_1079489556_7301203, duration(ns): 32677285
2023-01-06 18:36:02,566 INFO datanode.DataNode (BlockReceiver.java:run(1506)) - PacketResponder: BP-105892518-10.45.46.117-1630557182227:blk_1079489556_7301203, type=HAS_DOWNSTREAM_IN_PIPELINE, downstreams=2:[10.45.46.121:1019, 10.45.46.120:1019] terminating
2023-01-06 18:36:02,587 INFO datanode.DataNode (DataXceiver.java:writeBlock(738)) - Receiving BP-105892518-10.45.46.117-1630557182227:blk_1079489558_7301211 src: /10.45.46.117:35226 dest: /10.45.46.117:1019
2023-01-06 18:36:02,590 INFO datanode.DataNode (BlockReceiver.java:receiveBlock(1010)) - Exception for BP-105892518-10.45.46.117-1630557182227:blk_1079489554_7301207
java.io.IOException: Premature EOF from inputStream
at org.apache.hadoop.io.IOUtils.readFully(IOUtils.java:212)
at org.apache.hadoop.hdfs.protocol.datatransfer.PacketReceiver.doReadFully(PacketReceiver.java:211)
at org.apache.hadoop.hdfs.protocol.datatransfer.PacketReceiver.doRead(PacketReceiver.java:134)
at org.apache.hadoop.hdfs.protocol.datatransfer.PacketReceiver.receiveNextPacket(PacketReceiver.java:109)
at org.apache.hadoop.hdfs.server.datanode.BlockReceiver.receivePacket(BlockReceiver.java:528)
at org.apache.hadoop.hdfs.server.datanode.BlockReceiver.receiveBlock(BlockReceiver.java:971)
at org.apache.hadoop.hdfs.server.datanode.DataXceiver.writeBlock(DataXceiver.java:891)
at org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.opWriteBlock(Receiver.java:173)
at org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.processOp(Receiver.java:107)
at org.apache.hadoop.hdfs.server.datanode.DataXceiver.run(DataXceiver.java:290)
at java.lang.Thread.run(Thread.java:748)
2023-01-06 18:36:02,590 INFO datanode.DataNode (BlockReceiver.java:run(1470)) - PacketResponder: BP-105892518-10.45.46.117-1630557182227:blk_1079489554_7301207, type=HAS_DOWNSTREAM_IN_PIPELINE, downstreams=2:[10.45.46.120:1019, 10.45.46.121:1019]: Thread is interrupted.
2023-01-06 18:36:02,590 INFO datanode.DataNode (BlockReceiver.java:run(1506)) - PacketResponder: BP-105892518-10.45.46.117-1630557182227:blk_1079489554_7301207, type=HAS_DOWNSTREAM_IN_PIPELINE, downstreams=2:[10.45.46.120:1019, 10.45.46.121:1019] terminating
2023-01-06 18:36:02,590 INFO datanode.DataNode (DataXceiver.java:writeBlock(922)) - opWriteBlock BP-105892518-10.45.46.117-1630557182227:blk_1079489554_7301207 received exception java.io.IOException: Premature EOF from inputStream
2023-01-06 18:36:02,591 ERROR datanode.DataNode (DataXceiver.java:run(321)) - host117:1019:DataXceiver error processing WRITE_BLOCK operation src: /10.45.46.117:35204 dst: /10.45.46.117:1019
java.io.IOException: Premature EOF from inputStream
at org.apache.hadoop.io.IOUtils.readFully(IOUtils.java:212)
at org.apache.hadoop.hdfs.protocol.datatransfer.PacketReceiver.doReadFully(PacketReceiver.java:211)
at org.apache.hadoop.hdfs.protocol.datatransfer.PacketReceiver.doRead(PacketReceiver.java:134)
at org.apache.hadoop.hdfs.protocol.datatransfer.PacketReceiver.receiveNextPacket(PacketReceiver.java:109)
at org.apache.hadoop.hdfs.server.datanode.BlockReceiver.receivePacket(BlockReceiver.java:528)
at org.apache.hadoop.hdfs.server.datanode.BlockReceiver.receiveBlock(BlockReceiver.java:971)
at org.apache.hadoop.hdfs.server.datanode.DataXceiver.writeBlock(DataXceiver.java:891)
at org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.opWriteBlock(Receiver.java:173)
at org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.processOp(Receiver.java:107)
at org.apache.hadoop.hdfs.server.datanode.DataXceiver.run(DataXceiver.java:290)
at java.lang.Thread.run(Thread.java:748)
ERROR datanode.DataNode (DataXceiver.java:run(321)) - host117:1019:DataXceiver error processing WRITE_BLOCK operation src: /10.45.46.117:35204 dst: /10.45.46.117:1019
java.io.IOException: Premature EOF from inputStream
ссылка:https://cloud.tencent.com/developer/article/1404118
Если выделить из журнала наиболее важное сообщение — «DataXceiver error processing WRITE_BLOCK operation» — и сопоставить его с остальными записями журнала, становится ясно, что причиной сбоя узла данных является нехватка потоков передачи данных. Поэтому есть два способа оптимизации: 1. Увеличить лимит файловых дескрипторов на Linux-сервере, где расположен узел данных. 2. Увеличить максимальное число потоков передачи данных узла данных HDFS: dfs.datanode.max.transfer.threads.
ссылка:https://cloud.tencent.com/developer/article/1404118
Итог: увеличьте лимит открытых файлов (файловых дескрипторов) в Linux и число потоков передачи данных узла данных: dfs.datanode.max.transfer.threads
Эта статья является оригинальной статьёй блогера «xiaozhch5», работающего в области больших данных и искусственного интеллекта. Она распространяется по лицензии CC 4.0 BY-SA. При перепечатке приложите ссылку на первоисточник и это уведомление.
Исходная ссылка:https://cloud.tencent.com/developer/article/2206843