HDFS-17352. Add configuration to control whether DN delete this replica from disk when client requests a missing block #6559
FsDatasetImpl.java

@@ -129,6 +129,9 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_DELETE_CORRUPT_REPLICA_FROM_DISK_DEFAULT;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_DELETE_CORRUPT_REPLICA_FROM_DISK_ENABLE;
+
 /**************************************************
  * FSDataset manages a set of data blocks. Each block
  * has a unique name and an extent on disk.
@@ -287,6 +290,7 @@ public LengthInputStream getMetaDataInputStream(ExtendedBlock b)
   private long curDirScannerNotifyCount;
   private long lastDirScannerNotifyTime;
   private volatile long lastDirScannerFinishTime;
+  private volatile boolean deleteCorruptReplicaFromDisk;

   /**
    * An FSDataset has a directory where it loads its data files.
@@ -392,6 +396,9 @@ public LengthInputStream getMetaDataInputStream(ExtendedBlock b)
         DFSConfigKeys.DFS_DATANODE_DIRECTORYSCAN_MAX_NOTIFY_COUNT_KEY,
         DFSConfigKeys.DFS_DATANODE_DIRECTORYSCAN_MAX_NOTIFY_COUNT_DEFAULT);
     lastDirScannerNotifyTime = System.currentTimeMillis();
+    deleteCorruptReplicaFromDisk = conf.getBoolean(
+        DFS_DATANODE_DELETE_CORRUPT_REPLICA_FROM_DISK_ENABLE,
+        DFS_DATANODE_DELETE_CORRUPT_REPLICA_FROM_DISK_DEFAULT);
   }

   /**
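The constructor change above reads two constants that are imported at the top of the file but whose definitions in DFSConfigKeys are not part of this excerpt. Based on the property name and default shown in the hdfs-default.xml hunk below, their declarations would presumably look like this sketch:

```java
// Presumed DFSConfigKeys declarations (not shown in this diff); the key string
// and the default value are taken from the hdfs-default.xml entry in this PR.
public static final String DFS_DATANODE_DELETE_CORRUPT_REPLICA_FROM_DISK_ENABLE =
    "dfs.datanode.delete.corrupt.replica.from.disk.enable";
public static final boolean DFS_DATANODE_DELETE_CORRUPT_REPLICA_FROM_DISK_DEFAULT = true;
```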
@@ -2400,37 +2407,43 @@ public void invalidate(String bpid, ReplicaInfo block) {
   }

   /**
    * Invalidate a block which is not found on disk. We should remove it from
-   * memory and notify namenode, but unnecessary to delete the actual on-disk
-   * block file again.
+   * memory and notify namenode, will decide whether to delete the actual on-disk block and meta
+   * file based on {@link DFSConfigKeys#DFS_DATANODE_DELETE_CORRUPT_REPLICA_FROM_DISK_ENABLE}.
    *
    * @param bpid the block pool ID.
    * @param block The block to be invalidated.
    * @param checkFiles Whether to check data and meta files.
    */
-  public void invalidateMissingBlock(String bpid, Block block, boolean checkFiles) {
-
-    // The replica seems is on its volume map but not on disk.
-    // We can't confirm here is block file lost or disk failed.
-    // If block lost:
-    //    deleted local block file is completely unnecessary
-    // If disk failed:
-    //    deleted local block file here may lead to missing-block
-    //    when it with only 1 replication left now.
-    // So remove if from volume map notify namenode is ok.
+  public void invalidateMissingBlock(String bpid, Block block, boolean checkFiles)
+      throws IOException {
+    // If checkFiles is true, the existence of the block and meta file will be checked again.
+    // If deleteCorruptReplicaFromDisk is true, delete the existing block or meta file directly,
+    // otherwise just remove them from the memory volumeMap.
     try (AutoCloseableLock lock = lockManager.writeLock(LockLevel.BLOCK_POOl,
         bpid)) {
       // Check if this block is on the volume map.
       ReplicaInfo replica = volumeMap.get(bpid, block);
       // Double-check block or meta file existence when checkFiles is true.
       if (replica != null && (!checkFiles ||
           (!replica.blockDataExists() || !replica.metadataExists()))) {
-        volumeMap.remove(bpid, block);
-        invalidate(bpid, replica);
+        if (deleteCorruptReplicaFromDisk) {
+          ExtendedBlock extendedBlock = new ExtendedBlock(bpid, block);
+          datanode
+              .notifyNamenodeDeletedBlock(extendedBlock, replica.getStorageUuid());
+          invalidate(bpid, new Block[] {extendedBlock.getLocalBlock()});
+        } else {
+          // For detailed info, please refer to HDFS-16985.
+          volumeMap.remove(bpid, block);
+          invalidate(bpid, replica);
+        }
       }
     }
   }

-  public void invalidateMissingBlock(String bpid, Block block) {
+  public void invalidateMissingBlock(String bpid, Block block) throws IOException {
     invalidateMissingBlock(bpid, block, true);
   }

Review comment on the else branch: Please add some comments to describe the necessity of the

Author reply: Modified the patch based on the comments. Hi @ZanderXu, please help review it again, thanks~
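To make the new control flow easier to scan outside the diff, here is a tiny standalone model of the changed branch. It is an illustration only: the println calls stand in for the real volumeMap removal, namenode notification, and disk deletion, and none of the actual FsDatasetImpl types are used.

```java
/** Tiny standalone model of the new invalidateMissingBlock branch (illustration only). */
public class MissingBlockModel {
  private static boolean deleteCorruptReplicaFromDisk = true; // the new switch

  static void invalidateMissingBlock(String block, boolean dataExists, boolean metaExists) {
    // Mirrors the diff: act only when the data or meta file is actually missing.
    if (dataExists && metaExists) {
      return;
    }
    if (deleteCorruptReplicaFromDisk) {
      System.out.println("notify namenode, then delete on-disk files of " + block);
    } else {
      // HDFS-16985 behavior: keep on-disk files, drop only the in-memory entry.
      System.out.println("remove " + block + " from the volume map only");
    }
  }

  public static void main(String[] args) {
    invalidateMissingBlock("blk_1", false, true);   // deletes from disk
    deleteCorruptReplicaFromDisk = false;           // e.g. flipped via the new setter
    invalidateMissingBlock("blk_1", false, true);   // memory-only removal
  }
}
```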
@@ -3845,5 +3858,15 @@ public void setLastDirScannerFinishTime(long time) {
   public long getPendingAsyncDeletions() {
     return asyncDiskService.countPendingDeletions();
   }
+
+  @Override
+  public void setDeleteCorruptReplicaFromDisk(boolean deleteCorruptReplicaFromDisk) {
+    this.deleteCorruptReplicaFromDisk = deleteCorruptReplicaFromDisk;
+  }
+
+  @Override
+  public boolean isDeleteCorruptReplicaFromDisk() {
+    return deleteCorruptReplicaFromDisk;
+  }
 }
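The @Override annotations on the new setter and getter imply matching declarations on the dataset interface (FsDatasetSpi), which this excerpt does not show. They would presumably look like:

```java
// Presumed FsDatasetSpi additions implied by the @Override annotations above;
// not part of the hunks shown in this excerpt.
void setDeleteCorruptReplicaFromDisk(boolean deleteCorruptReplicaFromDisk);

boolean isDeleteCorruptReplicaFromDisk();
```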
hdfs-default.xml

@@ -3982,6 +3982,17 @@
   </description>
 </property>

+<property>
+  <name>dfs.datanode.delete.corrupt.replica.from.disk.enable</name>
+  <value>true</value>
+  <description>
+    Whether the datanode deletes the replica from disk when a client requests a
+    missing block. If true, the actual on-disk block and meta file will be
+    deleted; otherwise the replica will only be removed from the volume map and
+    the namenode notified. The default value is true.
+  </description>
+</property>
+
 <property>
   <name>dfs.webhdfs.rest-csrf.enabled</name>
   <value>false</value>

Review comment: If the default value is true, there is a risk of missing blocks according to HDFS-16985. I suggest setting the default value to false, as a missing block is a more serious problem than a delayed deletion of disk files. What's your opinion?

Author reply: Thanks @zhangshuyan0 for your comment. From the DataNode's point of view, once it has confirmed that the meta file or data file is lost, the replica should be deleted directly from memory and disk; that is the expected behavior. For the case mentioned in HDFS-16985, clusters deployed on AWS EC2 + EBS can adjust this configuration to false. So I think it may be better, from the DataNode's perspective, to keep the default set to true. Looking forward to your suggestions again.

Author reply: Hi @zhangshuyan0, would you mind taking a look again? Thanks~
Review comment: It would be handy if this could be configured dynamically.

Author reply: Thanks @tomscut for your review. I will support dynamic configuration later.
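For context on this request: the DataNode extends ReconfigurableBase, so supporting runtime changes would mean registering the key as reconfigurable and routing new values to the setter added in this PR. A hedged sketch of that wiring, following the usual ReconfigurableBase pattern (illustrative only; the actual follow-up implementation may differ):

```java
// Illustrative sketch of DataNode-side dynamic reconfiguration wiring for the
// new key; the key constant and setter come from this PR, the rest follows the
// standard ReconfigurableBase pattern and is an assumption.
@Override
public String reconfigurePropertyImpl(String property, String newValue)
    throws ReconfigurationException {
  if (DFS_DATANODE_DELETE_CORRUPT_REPLICA_FROM_DISK_ENABLE.equals(property)) {
    boolean enable = (newValue == null)
        ? DFS_DATANODE_DELETE_CORRUPT_REPLICA_FROM_DISK_DEFAULT
        : Boolean.parseBoolean(newValue);
    getFSDataset().setDeleteCorruptReplicaFromDisk(enable);
    return Boolean.toString(enable);
  }
  throw new ReconfigurationException(property, newValue, getConf().get(property));
}
```

An operator could then apply the change at runtime with `hdfs dfsadmin -reconfig datanode <host:ipc_port> start`, as with other reconfigurable DataNode properties.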