f2fs系列文章fsck(五)
fsck_verify通過前面的檢查結果來修正元資料。
首先是對nid的檢查情況進行檢視,f2fs_fsck中的nat_area_bitmap一開始通過讀取f2fs_nat_block中的所有的f2fs_nat_entry來記錄所有有效的nid,但是在遍歷的過程中在呼叫sanity_check_nid的時候已經將所有正常的nid都給clear掉了,所以在檢查這個點陣圖的時候,如果發現還有些位是有效的,那麼證明有錯誤發生。然後是f2fs_fsck中記錄硬連結連結串列的hard_link_list_head,正常情況下應該是NULL,如果不是,說明也是出錯誤了。接著f2fs_fsck的main_area_bitmap記錄了在遍歷過程中所訪問到的所有的block,也就是記錄了所有的有效塊,所以這個理論上應該跟f2fs_fsck的sit_area_bitmap是一致的,所以不一致代表著錯誤的發生。還有就是f2fs_fsck的check_result的valid_blk_cnt、valid_node_cnt、valid_nat_entry_cnt、valid_inode_cnt、sit_free_segs也要跟f2fs_checkpoint保持一致。然後是呼叫check_curseg_offset對current segment進行檢查,next_blkoff對應的block必須是空閒的。對於LFS寫的,該segment剩下的block必須全部都是空閒的。另外在遍歷過程中對seg_entry的type也是進行了一定的修改,這裡也是要與原始的type進行比對。
for (i = 0; i < fsck->nr_nat_entries; i++) { if (f2fs_test_bit(i, fsck->nat_area_bitmap) != 0) { printf("NID[0x%x] is unreachable\n", i); nr_unref_nid++; } } if (fsck->hard_link_list_head != NULL) { node = fsck->hard_link_list_head; while (node) { printf("NID[0x%x] has [0x%x] more unreachable links\n", node->nid, node->links); node = node->next; } c.bug_on = 1; } printf("[FSCK] Unreachable nat entries "); if (nr_unref_nid == 0x0) { printf(" [Ok..] [0x%x]\n", nr_unref_nid); } else { printf(" [Fail] [0x%x]\n", nr_unref_nid); ret = EXIT_ERR_CODE; c.bug_on = 1; } printf("[FSCK] SIT valid block bitmap checking "); if (memcmp(fsck->sit_area_bitmap, fsck->main_area_bitmap, fsck->sit_area_bitmap_sz) == 0x0) { printf("[Ok..]\n"); } else { printf("[Fail]\n"); ret = EXIT_ERR_CODE; c.bug_on = 1; } printf("[FSCK] Hard link checking for regular file "); if (fsck->hard_link_list_head == NULL) { printf(" [Ok..] [0x%x]\n", fsck->chk.multi_hard_link_files); } else { printf(" [Fail] [0x%x]\n", fsck->chk.multi_hard_link_files); ret = EXIT_ERR_CODE; c.bug_on = 1; } printf("[FSCK] valid_block_count matching with CP "); if (sbi->total_valid_block_count == fsck->chk.valid_blk_cnt) { printf(" [Ok..] [0x%x]\n", (u32)fsck->chk.valid_blk_cnt); } else { printf(" [Fail] [0x%x]\n", (u32)fsck->chk.valid_blk_cnt); ret = EXIT_ERR_CODE; c.bug_on = 1; } printf("[FSCK] valid_node_count matcing with CP (de lookup) "); if (sbi->total_valid_node_count == fsck->chk.valid_node_cnt) { printf(" [Ok..] [0x%x]\n", fsck->chk.valid_node_cnt); } else { printf(" [Fail] [0x%x]\n", fsck->chk.valid_node_cnt); ret = EXIT_ERR_CODE; c.bug_on = 1; } printf("[FSCK] valid_node_count matcing with CP (nat lookup) "); if (sbi->total_valid_node_count == fsck->chk.valid_nat_entry_cnt) { printf(" [Ok..] 
[0x%x]\n", fsck->chk.valid_nat_entry_cnt); } else { printf(" [Fail] [0x%x]\n", fsck->chk.valid_nat_entry_cnt); ret = EXIT_ERR_CODE; c.bug_on = 1; } printf("[FSCK] valid_inode_count matched with CP "); if (sbi->total_valid_inode_count == fsck->chk.valid_inode_cnt) { printf(" [Ok..] [0x%x]\n", fsck->chk.valid_inode_cnt); } else { printf(" [Fail] [0x%x]\n", fsck->chk.valid_inode_cnt); ret = EXIT_ERR_CODE; c.bug_on = 1; } printf("[FSCK] free segment_count matched with CP "); if (le32_to_cpu(F2FS_CKPT(sbi)->free_segment_count) == fsck->chk.sit_free_segs) { printf(" [Ok..] [0x%x]\n", fsck->chk.sit_free_segs); } else { printf(" [Fail] [0x%x]\n", fsck->chk.sit_free_segs); ret = EXIT_ERR_CODE; c.bug_on = 1; } printf("[FSCK] next block offset is free "); if (check_curseg_offset(sbi) == 0) { printf(" [Ok..]\n"); } else { printf(" [Fail]\n"); ret = EXIT_ERR_CODE; c.bug_on = 1; } printf("[FSCK] fixing SIT types\n"); if (check_sit_types(sbi) != 0) force = 1; printf("[FSCK] other corrupted bugs "); if (c.bug_on == 0) { printf(" [Ok..]\n"); } else { printf(" [Fail]\n"); ret = EXIT_ERR_CODE; }
以上只是對這些資料的一致性問題進行了檢查和列印。下面開始真正的修復工作。硬連結的問題由fix_hard_links來完成,nat的問題由fix_nat_entries來完成,sit的問題是由函式rewrite_sit_area_bitmap解決。move_curseg_info、write_curseg_info、flush_curseg_sit_entries共同完成current segment的問題,最後fix_checkpoint完成上述的統計資料到f2fs_checkpoint的修復工作。
/*
 * Excerpt from the body of fsck_verify(): the repair pass.
 *
 * Entered when `force` is set, or when c.fix_on is set and c.ro is clear.
 */
if (force || (c.fix_on && !c.ro)) {
	if (force || c.bug_on) {
		/* Something is wrong (or a fix was forced): run every fixer. */
		fix_hard_links(sbi);
		fix_nat_entries(sbi);
		rewrite_sit_area_bitmap(sbi);

		/* Relocate the current segments only if they fail the check. */
		if (check_curseg_offset(sbi)) {
			move_curseg_info(sbi, SM_I(sbi)->main_blkaddr);
			write_curseg_info(sbi);
			flush_curseg_sit_entries(sbi);
		}

		fix_checkpoint(sbi);
	} else {
		struct f2fs_checkpoint *cp = F2FS_CKPT(sbi);

		/* No bug found: only rewrite the checkpoint if CP_FSCK_FLAG is set. */
		if (is_set_ckpt_flags(cp, CP_FSCK_FLAG))
			write_checkpoint(sbi);
	}
}
fix_hard_links:如果f2fs_fsck的硬連結連結串列hard_link_list_head是NULL,那就直接返回,否則遍歷這個連結串列的節點,對每個節點的ino進行基本的sanity_check_nid檢查,然後將對應的f2fs_inode的連結數修復為記錄在連結串列節點中的實際的連結數actual_links。最後將修改之後的f2fs_inode寫回。
/*
 * fix_hard_links() - rewrite i_links for every inode on the hard link list.
 * @sbi: filesystem instance being checked.
 *
 * Walks fsck->hard_link_list_head. For each entry that passes
 * sanity_check_nid(), i_links is set to the entry's actual_links and the
 * node block is written to ni.blk_addr. Every list entry is freed and the
 * list head is reset to NULL so no dangling pointer is left behind.
 *
 * An entry that fails sanity_check_nid() is reported and skipped: in that
 * case `ni` may never have been filled in, and writing node_blk to
 * ni.blk_addr could overwrite an arbitrary block.
 * NOTE(review): assumes sanity_check_nid() loads the inode into node_blk
 * and fills ni on success — confirm against its definition.
 */
static void fix_hard_links(struct f2fs_sb_info *sbi)
{
	struct f2fs_fsck *fsck = F2FS_FSCK(sbi);
	struct hard_link_node *node;
	struct f2fs_node *node_blk = NULL;
	struct node_info ni;
	int ret;

	if (fsck->hard_link_list_head == NULL)
		return;

	node_blk = (struct f2fs_node *)calloc(BLOCK_SZ, 1);
	ASSERT(node_blk != NULL);

	node = fsck->hard_link_list_head;
	while (node) {
		/* Remember the successor before the current entry is freed. */
		struct hard_link_node *next = node->next;

		if (sanity_check_nid(sbi, node->nid, node_blk, F2FS_FT_MAX, TYPE_INODE, &ni)) {
			/* Do not touch the device with an unverified address. */
			FIX_MSG("Failed to fix, rerun fsck.f2fs");
		} else {
			node_blk->i.i_links = cpu_to_le32(node->actual_links);
			FIX_MSG("File: 0x%x i_links= 0x%x -> 0x%x", node->nid, node->links, node->actual_links);
			ret = dev_write_block(node_blk, ni.blk_addr);
			ASSERT(ret >= 0);
		}

		free(node);
		node = next;
	}

	/* All entries were freed above; do not leave the head dangling. */
	fsck->hard_link_list_head = NULL;
	free(node_blk);
}
fix_nat_entries:前面提過,執行到fsck_verify時,f2fs_fsck中的nat_area_bitmap正常情況下應該是將所有正常的nid的bit全部clear掉了。所以剩下的置了位所對應的nid都應該是無效的。這個函式就是完成這個功能,它逐位檢查f2fs_fsck中的nat_area_bitmap,發現置位了的,就呼叫nullify_nat_entry來將對應的nid無效掉。
/*
 * fix_nat_entries() - nullify every nid still marked in nat_area_bitmap.
 * @sbi: filesystem instance being checked.
 *
 * Scans bits [0, nr_nat_entries) of fsck->nat_area_bitmap and calls
 * nullify_nat_entry() for each bit that is set.
 */
static void fix_nat_entries(struct f2fs_sb_info *sbi)
{
	struct f2fs_fsck *fsck = F2FS_FSCK(sbi);
	u32 nid = 0;

	while (nid < fsck->nr_nat_entries) {
		if (f2fs_test_bit(nid, fsck->nat_area_bitmap))
			nullify_nat_entry(sbi, nid);
		nid++;
	}
}
nullify_nat_entry:這個函式完成將特定的nid無效掉,這個需要將記錄最新的nid對應的nat清空就行。記錄最新nat可能存在兩個地方,一個是在current segment的nat_journal中,還有一個就是記錄在裝置上的f2fs_nat_entry。所以nullify_nat_entry首先在nat_journal中查詢相應的nid,如果找到了就將相應的nat_journal的f2fs_nat_entry清空。否則需要讀取對應的f2fs_nat_block,找到nid的f2fs_nat_entry,將其清空並寫回。
/*
 * nullify_nat_entry() - invalidate the NAT entry of one nid.
 * @sbi: filesystem instance being checked.
 * @nid: node id whose NAT entry is to be cleared.
 *
 * The NAT journal of the hot-data current segment is searched first; a
 * matching journal entry is zeroed in memory and the function returns.
 * Otherwise the NAT block holding @nid is read from the device, patched
 * and written back. For the node/meta inode nids only block_addr is
 * changed (set to 0x1); any other nid has its whole entry zeroed.
 */
void nullify_nat_entry(struct f2fs_sb_info *sbi, u32 nid)
{
	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
	struct f2fs_journal *journal = &curseg->sum_blk->journal;
	struct f2fs_nat_block *nat_block;
	struct f2fs_nat_entry *entry;
	pgoff_t nat_addr;
	int idx;
	int ret;

	/* Pass 1: look for the nid in the in-memory NAT journal. */
	for (idx = 0; idx < nats_in_cursum(journal); idx++) {
		if (le32_to_cpu(nid_in_journal(journal, idx)) != nid)
			continue;

		memset(&nat_in_journal(journal, idx), 0, sizeof(struct f2fs_nat_entry));
		FIX_MSG("Remove nid [0x%x] in nat journal", nid);
		return;
	}

	/* Pass 2: patch the on-device NAT block. */
	nat_block = (struct f2fs_nat_block *)calloc(BLOCK_SZ, 1);
	ASSERT(nat_block);

	nat_addr = current_nat_addr(sbi, nid);
	ret = dev_read_block(nat_block, nat_addr);
	ASSERT(ret >= 0);

	entry = &nat_block->entries[nid % NAT_ENTRY_PER_BLOCK];

	if (nid != F2FS_NODE_INO(sbi) && nid != F2FS_META_INO(sbi)) {
		memset(entry, 0, sizeof(struct f2fs_nat_entry));
		FIX_MSG("Remove nid [0x%x] in NAT", nid);
	} else {
		FIX_MSG("nid [0x%x] block_addr= 0x%x -> 0x1", nid,
			le32_to_cpu(entry->block_addr));
		entry->block_addr = cpu_to_le32(0x1);
	}

	ret = dev_write_block(nat_block, nat_addr);
	ASSERT(ret >= 0);
	free(nat_block);
}
rewrite_sit_area_bitmap:這個函式主要完成f2fs_fsck中記錄遍歷過程中的真實有效塊的點陣圖main_area_bitmap到f2fs_sit_entry的同步。首先遍歷所有的segno,將segno對應的f2fs_sit_block讀取進來,然後找到相應的f2fs_sit_entry,然後用main_area_bitmap中segno對應的位置的點陣圖替代f2fs_sit_entry中的點陣圖,然後根據這個點陣圖更新其中的有效塊數,還有就是將更新後的seg_entry中的segment的type也同步到f2fs_sit_entry,最後將修復後的f2fs_sit_entry寫回。
/*
 * rewrite_sit_area_bitmap() - sync main_area_bitmap into every SIT entry.
 * @sbi: filesystem instance being checked.
 *
 * Resets n_sits in the cold-data summary block's journal to 0. Then, for
 * each segment, it copies that segment's slice of fsck->main_area_bitmap
 * into both the on-disk SIT entry and the in-memory seg_entry, recounts
 * the valid blocks, and rewrites the SIT block. A segment type at or
 * above NO_CHECK_TYPE is reported and stored as 0.
 */
void rewrite_sit_area_bitmap(struct f2fs_sb_info *sbi)
{
	struct f2fs_fsck *fsck = F2FS_FSCK(sbi);
	struct curseg_info *cold_data = CURSEG_I(sbi, CURSEG_COLD_DATA);
	struct sit_info *sit_i = SIT_I(sbi);
	char *seg_map = fsck->main_area_bitmap;
	unsigned int segno;

	cold_data->sum_blk->journal.n_sits = 0;

	/* seg_map advances by one segment's worth of bitmap per iteration. */
	for (segno = 0; segno < TOTAL_SEGS(sbi); segno++, seg_map += SIT_VBLOCK_MAP_SIZE) {
		struct f2fs_sit_block *sit_blk = get_current_sit_page(sbi, segno);
		struct f2fs_sit_entry *sit = &sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, segno)];
		struct seg_entry *se;
		u16 valid_blocks = 0;
		u16 type;
		int byte;

		memcpy(sit->valid_map, seg_map, SIT_VBLOCK_MAP_SIZE);
		for (byte = 0; byte < SIT_VBLOCK_MAP_SIZE; byte++)
			valid_blocks += get_bits_in_byte(sit->valid_map[byte]);

		se = get_seg_entry(sbi, segno);
		memcpy(se->cur_valid_map, seg_map, SIT_VBLOCK_MAP_SIZE);
		se->valid_blocks = valid_blocks;

		type = se->type;
		if (type >= NO_CHECK_TYPE) {
			ASSERT_MSG("Invalide type and valid blocks=%x,%x", segno, valid_blocks);
			type = 0;
		}

		/* vblocks packs the type above SIT_VBLOCKS_SHIFT and the count below. */
		sit->vblocks = cpu_to_le16((type << SIT_VBLOCKS_SHIFT) | valid_blocks);
		rewrite_current_sit_page(sbi, segno, sit_blk);
		free(sit_blk);
	}
}
之前提到過,函式check_curseg_offset檢查current segment是否出現了問題,這裡也是通過這個函式來檢查是不是出了問題,有問題就進行修復。check_curseg_offset主要檢查next_blkoff對應的block必須是空閒的。對於LFS寫的,該segment剩下的block必須全部都是空閒的。
/*
 * check_curseg_offset() - validate next_blkoff of every current segment.
 * @sbi: filesystem instance being checked.
 *
 * For each current segment type below NO_CHECK_TYPE:
 *  - next_blkoff must index inside the SIT valid-block bitmap;
 *  - the block at next_blkoff must be clear in cur_valid_map;
 *  - unless alloc_type is SSR, every block after next_blkoff in the
 *    segment must also be clear.
 *
 * Return: 0 when all current segments pass, -EINVAL on the first failure.
 */
int check_curseg_offset(struct f2fs_sb_info *sbi)
{
	int i;

	for (i = 0; i < NO_CHECK_TYPE; i++) {
		struct curseg_info *curseg = CURSEG_I(sbi, i);
		struct seg_entry *se;
		int j, nblocks;

		/* next_blkoff >> 3 is the bitmap byte that would be tested. */
		if ((curseg->next_blkoff >> 3) >= SIT_VBLOCK_MAP_SIZE)
			return -EINVAL;

		se = get_seg_entry(sbi, curseg->segno);
		if (f2fs_test_bit(curseg->next_blkoff, (const char *)se->cur_valid_map)) {
			ASSERT_MSG("Next block offset is not free, type:%d", i);
			return -EINVAL;
		}

		/*
		 * SSR segments skip the tail check. Move on to the next type
		 * rather than returning, so the remaining current segments
		 * are still validated.
		 */
		if (curseg->alloc_type == SSR)
			continue;

		nblocks = sbi->blocks_per_seg;
		for (j = curseg->next_blkoff + 1; j < nblocks; j++) {
			if (f2fs_test_bit(j, (const char *)se->cur_valid_map)) {
				ASSERT_MSG("LFS must have free section:%d", i);
				return -EINVAL;
			}
		}
	}
	return 0;
}
move_curseg_info:對NO_CHECK_TYPE種current segment進行遍歷,然後是呼叫函式find_next_free_block在main area中找到相應的seg_entry與遍歷的current segment有著相同型別的segment中的空閒塊或者整個segment空閒的起始塊,然後返回其segno,將這個segno替換該型別對應的current segment,然後修改current segment中的欄位segno、next_blkoff、alloc_type改為SSR(洞寫)、sum_blk。然後呼叫函式reset_curseg根據current segment的type來設定curseg_info中sum_blk中的summary_footer的型別,由於剛才找空閒塊的時候如果是空閒segment,那麼這個segment的type可能跟需要查詢的型別是不對應的,所以reset_curseg也完成對seg_entry的型別的修改。
/*
 * move_curseg_info() - relocate every current segment to a free block.
 * @sbi:  filesystem instance being checked.
 * @from: block address at which each free-block search starts.
 *
 * For each current segment type the present summary block is first
 * written to the SSA slot of the old segment. find_next_free_block()
 * then picks a new location starting from @from, and segno, next_blkoff
 * and alloc_type (set to SSR) are updated. The summary entries of the new
 * segment are loaded from its SSA block, and reset_curseg() is called for
 * the type.
 */
void move_curseg_info(struct f2fs_sb_info *sbi, u64 from)
{
	int i, ret;

	for (i = 0; i < NO_CHECK_TYPE; i++) {
		struct curseg_info *curseg = CURSEG_I(sbi, i);
		struct f2fs_summary_block buf;
		u64 to = from;
		u32 old_segno;

		/* Persist the summary of the segment we are about to leave. */
		ret = dev_write_block(curseg->sum_blk, GET_SUM_BLKADDR(sbi, curseg->segno));
		ASSERT(ret >= 0);

		ret = find_next_free_block(sbi, &to, 0, i);
		ASSERT(ret == 0);

		old_segno = curseg->segno;
		curseg->segno = GET_SEGNO(sbi, to);
		curseg->next_blkoff = OFFSET_IN_SEG(sbi, to);
		curseg->alloc_type = SSR;

		/* Only the summary-entries part of the new SSA block is copied. */
		ret = dev_read_block(&buf, GET_SUM_BLKADDR(sbi, curseg->segno));
		ASSERT(ret >= 0);
		memcpy(curseg->sum_blk, &buf, SUM_ENTRIES_SIZE);

		reset_curseg(sbi, i);

		DBG(1, "Move curseg[%d] %x -> %x after %"PRIx64"\n", i, old_segno, curseg->segno, from);
	}
}
write_curseg_info:將修改後的current segment的segno和blkoff修改到f2fs_checkpoint中的cur_data_segno(cur_node_segno)、cur_data_blkoff(cur_node_blkoff),還有分配的型別alloc_type也進行更新。
void write_curseg_info(struct f2fs_sb_info *sbi)
{
struct f2fs_checkpoint *cp = F2FS_CKPT(sbi);
int i;
for (i = 0; i < NO_CHECK_TYPE; i++) {
cp->alloc_type[i] = CURSEG_I(sbi, i)->alloc_type;
if (i < CURSEG_HOT_NODE) {
set_cp(cur_data_segno[i], CURSEG_I(sbi, i)->segno);
set_cp(cur_data_blkoff[i], CURSEG_I(sbi, i)->next_blkoff);
} else {
int n = i - CURSEG_HOT_NODE;
set_cp(cur_node_segno[n], CURSEG_I(sbi, i)->segno);
set_cp(cur_node_blkoff[n], CURSEG_I(sbi, i)->next_blkoff);
}
}
}
flush_curseg_sit_entries:之前的move_curseg_info呼叫函式reset_curseg的過程中可能對seg_entry進行了修改,這個函式將current的seg_entry同步到f2fs_sit_entry中寫回。
/*
 * flush_curseg_sit_entries() - write current segments' type/count to SIT.
 * @sbi: filesystem instance being checked.
 *
 * For each current segment, the SIT block covering its segno is fetched,
 * vblocks is rebuilt from the in-memory seg_entry (type and valid_blocks),
 * and the SIT block is rewritten.
 */
static void flush_curseg_sit_entries(struct f2fs_sb_info *sbi)
{
	struct sit_info *sit_i = SIT_I(sbi);
	int type;

	for (type = 0; type < NO_CHECK_TYPE; type++) {
		struct curseg_info *curseg = CURSEG_I(sbi, type);
		struct seg_entry *se = get_seg_entry(sbi, curseg->segno);
		struct f2fs_sit_block *sit_blk = get_current_sit_page(sbi, curseg->segno);
		struct f2fs_sit_entry *sit;

		sit = &sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, curseg->segno)];
		sit->vblocks = cpu_to_le16((se->type << SIT_VBLOCKS_SHIFT) | se->valid_blocks);

		rewrite_current_sit_page(sbi, curseg->segno, sit_blk);
		free(sit_blk);
	}
}
fix_checkpoint:首先將f2fs_fsck中的check_result中的統計結果同步到f2fs_checkpoint中,這些資料包括ckpt_flags、free_segment_count、valid_block_count、valid_node_count、valid_inode_count。然後按照cp pack中的順序跳過orphan inode進行寫回。
static void fix_checkpoint(struct f2fs_sb_info *sbi)
{
struct f2fs_fsck *fsck = F2FS_FSCK(sbi);
struct f2fs_super_block *sb = F2FS_RAW_SUPER(sbi);
struct f2fs_checkpoint *cp = F2FS_CKPT(sbi);
unsigned long long cp_blk_no;
u32 flags = CP_UMOUNT_FLAG;
block_t orphan_blks = 0;
u32 i;
int ret;
u_int32_t crc = 0;
if (is_set_ckpt_flags(cp, CP_ORPHAN_PRESENT_FLAG)) {
orphan_blks = __start_sum_addr(sbi) - 1;
flags |= CP_ORPHAN_PRESENT_FLAG;
}
set_cp(cp_pack_total_block_count, 8 + orphan_blks + get_sb(cp_payload));
flags = update_nat_bits_flags(sb, cp, flags);
flags |= CP_NOCRC_RECOVERY_FLAG;
set_cp(ckpt_flags, flags);
set_cp(free_segment_count, get_free_segments(sbi));
set_cp(valid_block_count, fsck->chk.valid_blk_cnt);
set_cp(valid_node_count, fsck->chk.valid_node_cnt);
set_cp(valid_inode_count, fsck->chk.valid_inode_cnt);
crc = f2fs_cal_crc32(F2FS_SUPER_MAGIC, cp, CHECKSUM_OFFSET);
*((__le32 *)((unsigned char *)cp + CHECKSUM_OFFSET)) = cpu_to_le32(crc);
cp_blk_no = get_sb(cp_blkaddr);
if (sbi->cur_cp == 2)
cp_blk_no += 1 << get_sb(log_blocks_per_seg);
ret = dev_write_block(cp, cp_blk_no++);
ASSERT(ret >= 0);
for (i = 0; i < get_sb(cp_payload); i++) {
ret = dev_write_block(((unsigned char *)cp) + i * F2FS_BLKSIZE, cp_blk_no++);
ASSERT(ret >= 0);
}
cp_blk_no += orphan_blks;
for (i = 0; i < NO_CHECK_TYPE; i++) {
struct curseg_info *curseg = CURSEG_I(sbi, i);
ret = dev_write_block(curseg->sum_blk, cp_blk_no++);
ASSERT(ret >= 0);
}
ret = dev_write_block(cp, cp_blk_no++);
ASSERT(ret >= 0);
if (flags & CP_NAT_BITS_FLAG)
write_nat_bits(sbi, sb, cp, sbi->cur_cp);
}