Docker儲存驅動之overlay新映象儲存的實現和inode耗盡問題
映象是按層下載和管理的,新映象下載的檔案臨時存放在/var/lib/docker/tmp,檔案命名方式是GetImageBlobxxx(xxx是一串隨機數字),這些臨時檔案是按層打包的tar.gz等壓縮包。臨時檔案首先被解壓為tar包存在快取中,然後使用docker\layer\layer_store.go的layerStore的Register函式註冊到系統中,最後臨時檔案被刪除。
在docker\distribution\pull_v2.go:
// Download (excerpt: the first part of the body is elided in this listing)
// returns tmpFile wrapped by ioutils.NewReadCloserWrapper, together with size.
// The callback handed to the wrapper closes tmpFile and then deletes it from
// disk; presumably the wrapper invokes it when the returned ReadCloser is
// closed — confirm against ioutils.NewReadCloserWrapper.
func (ld *v2LayerDescriptor) Download(ctx context.Context, progressOutput progress.Output) (io.ReadCloser, int64, error) {
...
return ioutils.NewReadCloserWrapper(tmpFile, func() error {
tmpFile.Close()
// Delete the temporary file once it has been closed.
err := os.RemoveAll(tmpFile.Name())
if err != nil {
logrus.Errorf("Failed to remove temp file: %s", tmpFile.Name())
}
// The removal error (if any) is also handed back to the caller.
return err
}), size, nil
}
所謂註冊就是將實際資料寫到檔案系統中。這個過程分三步:
第一,建立映象層的獨有目錄
第二,將解壓tar包資料寫到相應映象層的獨有目錄中
第三,在映象層管理Map中插入映象層物件
// Register registers the layer read from ts on top of parent. It logs the
// parent chain ID at debug level and delegates to registerWithDescriptor with
// a zero-value distribution.Descriptor.
func (ls *layerStore) Register(ts io.Reader, parent ChainID) (Layer, error) {
	logrus.Debugf("Register parent: %s", parent)

	// No descriptor is known for layers registered through this entry point.
	var noDescriptor distribution.Descriptor
	return ls.registerWithDescriptor(ts, parent, noDescriptor)
}
// registerWithDescriptor stores the layer read from ts on top of parent and
// records it in ls.layerMap under its chain ID.
//
// Steps, in order:
//  1. resolve the parent (if any) and check the depth limit,
//  2. ask the graph driver to create the layer under a fresh random cache ID,
//  3. apply the tar stream and persist the metadata in a store transaction,
//  4. under ls.layerL, either return a reference to an already-registered
//     layer with the same chain ID, or commit and insert the new layer.
//
// It returns ErrLayerDoesNotExist when parent is non-empty but unknown and
// ErrMaxDepthExceeded when the parent is already at maxLayerDepth; other
// errors come from the driver or the metadata store.
func (ls *layerStore) registerWithDescriptor(ts io.Reader, parent ChainID, descriptor distribution.Descriptor) (Layer, error) {
	// err is used to hold the error which will always trigger
	// cleanup of creates sources but may not be an error returned
	// to the caller (already exists).
	var err error
	var pid string
	var p *roLayer
	if string(parent) != "" {
		// NOTE(review): ls.get presumably looks the parent up in the
		// in-memory layer map and takes a reference on it (the deferred
		// release below pairs with that) — confirm against ls.get.
		p = ls.get(parent)
		if p == nil {
			return nil, ErrLayerDoesNotExist
		}
		pid = p.cacheID
		// Release parent chain if error
		defer func() {
			if err != nil {
				ls.layerL.Lock()
				ls.releaseLayer(p)
				ls.layerL.Unlock()
			}
		}()
		if p.depth() >= maxLayerDepth {
			err = ErrMaxDepthExceeded
			return nil, err
		}
	}

	// Create new roLayer
	layer := &roLayer{
		parent:         p,
		cacheID:        stringid.GenerateRandomID(),
		referenceCount: 1,
		layerStore:     ls,
		references:     map[Layer]struct{}{},
		descriptor:     descriptor,
	}

	// pid is empty when there is no parent; the driver decides what to
	// create in either case (for overlay: a "root" directory for a base
	// layer, overlay directories otherwise).
	if err = ls.driver.Create(layer.cacheID, pid, "", nil); err != nil {
		return nil, err
	}

	tx, err := ls.store.StartTransaction()
	if err != nil {
		// The cleanup defer below is not registered yet, so the driver
		// layer created above has to be removed here; otherwise it would
		// be left behind on disk with no metadata pointing at it.
		if rmErr := ls.driver.Remove(layer.cacheID); rmErr != nil {
			logrus.Errorf("Error cleaning up cache layer %s: %v", layer.cacheID, rmErr)
		}
		return nil, err
	}

	// From here on, any non-nil err (including the synthetic "already
	// exists" one) removes the driver layer and cancels the transaction.
	defer func() {
		if err != nil {
			logrus.Debugf("Cleaning up layer %s: %v", layer.cacheID, err)
			if err := ls.driver.Remove(layer.cacheID); err != nil {
				logrus.Errorf("Error cleaning up cache layer %s: %v", layer.cacheID, err)
			}
			if err := tx.Cancel(); err != nil {
				logrus.Errorf("Error canceling metadata transaction %q: %s", tx.String(), err)
			}
		}
	}()

	// Apply the tar stream to the freshly created layer.
	if err = ls.applyTar(tx, ts, pid, layer); err != nil {
		return nil, err
	}

	// A base layer's chain ID is its diff ID; otherwise it is derived from
	// the parent's chain ID and this layer's diff ID.
	if layer.parent == nil {
		layer.chainID = ChainID(layer.diffID)
	} else {
		layer.chainID = createChainIDFromParent(layer.parent.chainID, layer.diffID)
	}

	if err = storeLayer(tx, layer); err != nil {
		return nil, err
	}

	ls.layerL.Lock()
	defer ls.layerL.Unlock()

	if existingLayer := ls.getWithoutLock(layer.chainID); existingLayer != nil {
		// Set error for cleanup, but do not return the error
		err = errors.New("layer already exists")
		return existingLayer.getReference(), nil
	}

	if err = tx.Commit(layer.chainID); err != nil {
		return nil, err
	}

	// Register the layer in the in-memory map.
	ls.layerMap[layer.chainID] = layer

	return layer.getReference(), nil
}
這個過程的前兩步是依賴驅動來完成的,對於overlay驅動來說,有兩種情景:
第一,處理的映象層有父層
第二,處理的映象層沒有父層,也就是映象層是基礎映象層
我們先看下overlay驅動的初始化:
// Init builds the overlay graph driver rooted at home and returns it wrapped
// by NaiveDiffDriverWithApply.
//
// It returns graphdriver.ErrNotSupported when supportsOverlay reports an
// error, and graphdriver.ErrIncompatibleFS when home lives on aufs, btrfs,
// overlay, zfs or ecryptfs. Errors from detecting the filesystem, resolving
// the root UID/GID, creating home, or making the mount private are returned
// unchanged.
func Init(home string, options []string, uidMaps, gidMaps []idtools.IDMap) (graphdriver.Driver, error) {
	if supportsOverlay() != nil {
		return nil, graphdriver.ErrNotSupported
	}

	fsMagic, err := graphdriver.GetFSMagic(home)
	if err != nil {
		return nil, err
	}
	// Remember a readable name of the backing filesystem when one is known.
	if name, known := graphdriver.FsNames[fsMagic]; known {
		backingFs = name
	}

	// Refuse backing filesystems that overlay cannot be layered on.
	switch fsMagic {
	case graphdriver.FsMagicAufs,
		graphdriver.FsMagicBtrfs,
		graphdriver.FsMagicOverlay,
		graphdriver.FsMagicZfs,
		graphdriver.FsMagicEcryptfs:
		logrus.Errorf("'overlay' is not supported over %s", backingFs)
		return nil, graphdriver.ErrIncompatibleFS
	}

	rootUID, rootGID, err := idtools.GetRootUIDGID(uidMaps, gidMaps)
	if err != nil {
		return nil, err
	}

	// Create the driver home dir; an "already exists" error is tolerated.
	mkErr := idtools.MkdirAllAs(home, 0700, rootUID, rootGID)
	if mkErr != nil && !os.IsExist(mkErr) {
		return nil, mkErr
	}

	if err = mount.MakePrivate(home); err != nil {
		return nil, err
	}

	overlayDriver := &Driver{
		home:    home,
		uidMaps: uidMaps,
		gidMaps: gidMaps,
		ctr:     graphdriver.NewRefCounter(graphdriver.NewFsChecker(graphdriver.FsMagicOverlay)),
	}
	return NaiveDiffDriverWithApply(overlayDriver, uidMaps, gidMaps), nil
}
// NaiveDiffDriverWithApply wraps driver in a naive-diff driver and keeps
// driver itself as the custom ApplyDiff implementation, so both are reachable
// from the returned graphdriver.Driver.
func NaiveDiffDriverWithApply(driver ApplyDiffProtoDriver, uidMaps, gidMaps []idtools.IDMap) graphdriver.Driver {
	wrapped := new(naiveDiffDriverWithApply)
	wrapped.Driver = graphdriver.NewNaiveDiffDriver(driver, uidMaps, gidMaps)
	wrapped.applyDiff = driver
	return wrapped
}
可以看到返回的是naiveDiffDriverWithApply,而naiveDiffDriverWithApply包含兩個物件Driver和applyDiff。
根據go語言特性,第一步呼叫
if err = ls.driver.Create(layer.cacheID, pid, "", nil); err != nil {
return nil, err
}
呼叫的是檔案docker\daemon\graphdriver\overlay\overlay.go中的Driver:該Driver實現了Create,所以會呼叫Driver的Create函式:
// Create prepares the on-disk directory for layer id, optionally on top of
// parent.
//
//   - Non-empty storageOpt is rejected.
//   - Without a parent, only a "root" directory is created.
//   - With a parent that has a "root" directory, "upper", "work" and "merged"
//     are created and the parent id is written to the "lower-id" file.
//   - Otherwise the parent's "lower-id" is copied, the directories are
//     created, and the parent's "upper" is copied into the new "upper".
//
// If any step after creating the layer directory fails, that directory is
// removed again before returning.
func (d *Driver) Create(id, parent, mountLabel string, storageOpt map[string]string) (retErr error) {
	if len(storageOpt) != 0 {
		return fmt.Errorf("--storage-opt is not supported for overlay")
	}

	layerDir := d.dir(id)
	rootUID, rootGID, err := idtools.GetRootUIDGID(d.uidMaps, d.gidMaps)
	if err != nil {
		return err
	}

	// path.Dir(layerDir) is the path without its last element, i.e. the
	// directory that will contain this layer's directory.
	if err := idtools.MkdirAllAs(path.Dir(layerDir), 0700, rootUID, rootGID); err != nil {
		return err
	}
	// The layer's own directory.
	if err := idtools.MkdirAs(layerDir, 0700, rootUID, rootGID); err != nil {
		return err
	}

	// Clean up on failure.
	defer func() {
		if retErr != nil {
			os.RemoveAll(layerDir)
		}
	}()

	// Toplevel images are just a "root" dir.
	if parent == "" {
		return idtools.MkdirAs(path.Join(layerDir, "root"), 0755, rootUID, rootGID)
	}

	logrus.Debugf("Make layer dir")

	// Ensure parent exists.
	parentDir := d.dir(parent)
	if _, err := os.Lstat(parentDir); err != nil {
		return err
	}

	upperDir := path.Join(layerDir, "upper")
	lowerIDFile := path.Join(layerDir, "lower-id")

	// mkScratchDirs creates the "work" and "merged" directories, in that order.
	mkScratchDirs := func() error {
		for _, name := range []string{"work", "merged"} {
			if err := idtools.MkdirAs(path.Join(layerDir, name), 0700, rootUID, rootGID); err != nil {
				return err
			}
		}
		return nil
	}

	// If parent has a root, just do an overlay to it: "upper" takes the mode
	// of the parent's root.
	parentRootInfo, err := os.Lstat(path.Join(parentDir, "root"))
	if err == nil {
		if err := idtools.MkdirAs(upperDir, parentRootInfo.Mode(), rootUID, rootGID); err != nil {
			return err
		}
		if err := mkScratchDirs(); err != nil {
			return err
		}
		return ioutil.WriteFile(lowerIDFile, []byte(parent), 0666)
	}

	// Otherwise, copy the upper and the lower-id from the parent.
	lowerID, err := ioutil.ReadFile(path.Join(parentDir, "lower-id"))
	if err != nil {
		return err
	}
	if err := ioutil.WriteFile(lowerIDFile, lowerID, 0666); err != nil {
		return err
	}

	parentUpperDir := path.Join(parentDir, "upper")
	parentUpperInfo, err := os.Lstat(parentUpperDir)
	if err != nil {
		return err
	}
	if err := idtools.MkdirAs(upperDir, parentUpperInfo.Mode(), rootUID, rootGID); err != nil {
		return err
	}
	if err := mkScratchDirs(); err != nil {
		return err
	}

	// Presumably copies everything in the parent's upper directory into this
	// layer's upper directory — confirm against copyDir.
	return copyDir(parentUpperDir, upperDir, 0)
}
閱讀程式碼可以得知,如果有父層(必然也有父層目錄的root目錄,遇到的情況似乎都是這樣),則會在本映象層目錄建立upper,work,merged三個目錄和lower-id檔案,然後返回。如果沒有父層,映象層本身是基礎映象層,則直接在本映象層目錄建一個root子資料夾,然後返回。
根據go語言特性,第二步呼叫:
//應用tar包
if err = ls.applyTar(tx, ts, pid, layer); err != nil {
return nil, err
}
呼叫的是docker\daemon\graphdriver\overlay\overlay.go的naiveDiffDriverWithApply的成員ApplyDiff:
// ApplyDiff first tries the custom applyDiff implementation. Only when that
// reports ErrApplyDiffFallback does it fall back to the embedded Driver (set
// up in NaiveDiffDriverWithApply); any other outcome, success or error, is
// returned as is.
func (d *naiveDiffDriverWithApply) ApplyDiff(id, parent string, diff archive.Reader) (int64, error) {
	size, err := d.applyDiff.ApplyDiff(id, parent, diff)
	if err != ErrApplyDiffFallback {
		return size, err
	}
	// The custom implementation declined; use the naive diff driver.
	return d.Driver.ApplyDiff(id, parent, diff)
}
可以看到naiveDiffDriverWithApply.ApplyDiff首先會嘗試呼叫d.applyDiff.ApplyDiff,如果它返回ErrApplyDiffFallback,則改為呼叫d.Driver.ApplyDiff;其他情況(成功或別的錯誤)則直接返回其結果。
d.applyDiff.ApplyDiff也就是docker\daemon\graphdriver\overlay\overlay.go的Driver的成員函式ApplyDiff:
// ApplyDiff applies the layer diff on top of a hardlinked copy of the
// parent's "root" directory and installs the result as this layer's "root".
//
// It returns ErrApplyDiffFallback when parent is empty or the parent has no
// "root" directory. On success the "upper", "work", "merged" and "lower-id"
// entries of the layer directory are removed; on failure the temporary root
// is removed instead. size is the value reported by
// graphdriver.ApplyUncompressedLayer.
func (d *Driver) ApplyDiff(id string, parent string, diff archive.Reader) (size int64, err error) {
	layerDir := d.dir(id)

	if parent == "" {
		logrus.Debugf("Applied tar on err,no parent")
		return 0, ErrApplyDiffFallback
	}
	logrus.Debugf("Applied tar on parent:%s", parent)

	// Only continue when the parent layer has a "root" directory.
	parentRootDir := path.Join(d.dir(parent), "root")
	if _, statErr := os.Stat(parentRootDir); statErr != nil {
		return 0, ErrApplyDiffFallback
	}

	// We now know there is a parent, and it has a "root" directory containing
	// the full root filesystem. We can just hardlink it and apply the
	// layer. This relies on two things:
	// 1) ApplyDiff is only run once on a clean (no writes to upper layer) container
	// 2) ApplyDiff doesn't do any in-place writes to files (would break hardlinks)
	// These are all currently true and are not expected to break

	// Work in a temporary "tmproot*" directory inside the layer directory.
	tmpRootDir, err := ioutil.TempDir(layerDir, "tmproot")
	if err != nil {
		return 0, err
	}

	// Failure: discard the temporary root. Success: the overlay scaffolding
	// is no longer needed.
	defer func() {
		if err != nil {
			os.RemoveAll(tmpRootDir)
			return
		}
		for _, name := range []string{"upper", "work", "merged", "lower-id"} {
			os.RemoveAll(path.Join(layerDir, name))
		}
	}()

	// Populate the temporary root from the parent's root using hardlinks.
	if err = copyDir(parentRootDir, tmpRootDir, copyHardlink); err != nil {
		return 0, err
	}

	// Unpack this layer's diff over the hardlinked tree.
	tarOpts := &archive.TarOptions{UIDMaps: d.uidMaps, GIDMaps: d.gidMaps}
	size, err = graphdriver.ApplyUncompressedLayer(tmpRootDir, diff, tarOpts)
	if err != nil {
		return 0, err
	}

	// Only now does the tree take its final name "root".
	if renameErr := os.Rename(tmpRootDir, path.Join(layerDir, "root")); renameErr != nil {
		return 0, renameErr
	}
	return size, nil
}
對於有父層的映象層,會在映象層目錄建一個tmproot目錄,然後將父層root目錄的所有內容建立硬連結到該目錄,再將本層的新資料解壓覆蓋到這些硬連結之上,接著把tmproot改名為root,最後(成功返回時)刪除upper等目錄(說實在話這是什麼鬼,建了刪,建了改)。由linux的硬連結的特性知,對於同名的檔案,檔名(目錄項物件)將指向新的檔案(子層檔案,inode),其他的還是父層檔案(父層inode)。這樣完成了映象層的合併。
對於沒有父層的映象層,這個更簡單,呼叫上述函式將出錯返回,然後呼叫docker\daemon\graphdriver\fsdiff.go的NaiveDiffDriver的成員ApplyDiff:
// ApplyDiff extracts the changeset from the given diff into the
// layer with the specified id and parent, returning the size of the
// new layer in bytes.
//
// The layer is obtained from the underlying ProtoDriver with Get and released
// with Put when the function returns. Errors from Get or from
// ApplyUncompressedLayer are returned unchanged.
func (gdw *NaiveDiffDriver) ApplyDiff(id, parent string, diff archive.Reader) (size int64, err error) {
	driver := gdw.ProtoDriver

	// Mount the root filesystem so we can apply the diff/layer.
	// NOTE(review): for the overlay driver, Get presumably just returns the
	// layer's "root" directory when it exists — confirm against Driver.Get.
	layerFs, err := driver.Get(id, "")
	if err != nil {
		return size, err
	}
	defer driver.Put(id)

	options := &archive.TarOptions{
		UIDMaps: gdw.uidMaps,
		GIDMaps: gdw.gidMaps,
	}
	start := time.Now().UTC()
	// Debugf, not Debug: the message contains a format verb.
	logrus.Debugf("ApplyUncompressedLayer to:%s", layerFs)
	if size, err = ApplyUncompressedLayer(layerFs, diff, options); err != nil {
		return size, err
	}
	logrus.Debugf("Untar time: %vs", time.Now().UTC().Sub(start).Seconds())

	return size, err
}
直接在映象目錄建一個root資料夾,將tar包解壓到該資料夾。可以看到docker的overlay驅動處理映象層合併問題是採用將底層映象層的內容建立硬連結到子層的方法,如果底層映象層檔案比較多,而映象又有很多層,會出現什麼問題?因為檔案系統劃分的元資料區大小是有限的,每一個新層就要建立底層的檔案的硬連結,硬連結也就是目錄項物件,這些目錄項物件由目錄(特殊檔案,每個目錄本身也佔用一個inode)來存放,而目錄是儲存在元資料區的。這樣檔案系統資料區還沒有使用完,大量產生的inode就已經佔滿了元資料區——這就是inode耗盡問題。