雲實例初始化工具cloud-init原始碼分析
阿新 • • 發佈:2022-05-17
原始碼分析
程式碼結構
cloud-init的程式碼結構如下:
cloud-init
├── bash_completion # bash自動補全檔案
│   └── cloud-init
├── ChangeLog # 更新日誌
├── cloudinit
│   ├── cloud.py # Cloud類
│   ├── cmd # 命令列操作目錄
│   │   ├── clean.py # cloud-init clean
│   │   ├── cloud_id.py # cloud-id
│   │   ├── devel # cloud-init devel
│   │   ├── __init__.py
│   │   ├── main.py # cloud-init init/modules
│   │   ├── query.py # cloud-init query
│   │   └── status.py # cloud-init status
│   ├── config # 模組目錄
│   ├── distros # 系統發行版目錄
│   ├── handlers # 模板渲染處理函式目錄
│   ├── helpers.py # 幫助函式
│   ├── __init__.py
│   ├── log.py # 日誌處理
│   ├── mergers # 配置合併目錄
│   ├── net # 網路目錄
│   ├── settings.py # 內建配置
│   ├── sources # 資料來源目錄
│   ├── stages.py # Init類
│   ├── ...
│   └── warnings.py
├── doc # 文件
│   ├── ...
├── packages # 各大Linux發行包製作指令碼
│   ├── ...
├── README.md # 簡介
├── requirements.txt # 依賴包
├── rhel # 針對redhat系linux發行版的補丁
│   ├── cloud.cfg
│   ├── cloud-init-tmpfiles.conf
│   ├── README.rhel
│   └── systemd
├── setup.py # python模組安裝檔案
├── tests # 單元測試目錄
│   ├── ...
├── tools # 額外工具
│   ├── ...
└── tox.ini # tox配置檔案

44 directories, 127 files
cloud-init的函式入口點位於cloudinit/cmd/main.py,該檔案包含了所有cloud-init執行階段的邏輯程式碼。
cloud-init init [ --local ]
cmd/main.py,main_init:init階段執行此主函式,分析如下:
def main_init(name, args):
    """Run the cloud-init ``init`` stage (``cloud-init init [--local]``).

    ``args.local`` selects between the local stage and the network stage;
    ``args.reporter``, ``args.debug`` and ``args.force`` are also read here.

    Returns a 2-tuple ``(datasource, results)``. ``datasource`` is ``None``
    when the function exits before a usable datasource is selected. On the
    early-exit paths ``results`` is a list of failure-message strings (empty
    when the exit is not treated as a failure); on the normal path it is
    whatever ``run_module_section`` returns.
    """
    # Datasource dependencies: the local stage needs only FILESYSTEM, the
    # network stage needs FILESYSTEM and NETWORK.
    deps = [sources.DEP_FILESYSTEM, sources.DEP_NETWORK]
    if args.local:
        deps = [sources.DEP_FILESYSTEM]

    # Early log records, collected here and replayed once logging is set up.
    early_logs = [attempt_cmdline_url(
        path=os.path.join("%s.d" % CLOUD_CONFIG,
                          "91_kernel_cmdline_url.cfg"),
        network=not args.local)]

    # Choose the welcome message: "init-local" for the local stage, "init"
    # for the network stage.
    if not args.local:
        w_msg = welcome_format(name)
    else:
        # e.g. "Cloud-init v. 19.4 running 'init-local' at Wed, 15 Dec 2021
        # 03:00:40 +0000. Up 250.20 seconds."
        w_msg = welcome_format("%s-local" % (name))

    # Instantiate the stages.Init class.
    init = stages.Init(ds_deps=deps, reporter=args.reporter)

    # Stage 1
    # Load configuration. Priority from lowest to highest:
    # built-in config --> /etc/cloud/cloud.cfg{,.d}
    # --> /run/cloud-init/cloud.cfg --> kernel cmdline
    init.read_cfg(extract_fns(args))

    # Stage 2
    # Redirect stdout and stderr.
    outfmt = None
    errfmt = None
    try:
        early_logs.append((logging.DEBUG, "Closing stdin."))
        util.close_stdin()
        (outfmt, errfmt) = util.fixup_output(init.cfg, name)
    except Exception:
        msg = "Failed to setup output redirection!"
        util.logexc(LOG, msg)
        print_exc(msg)
        early_logs.append((logging.WARN, msg))
    if args.debug:
        # Reset so that all the debug handlers are closed out
        LOG.debug(("Logging being reset, this logger may no"
                   " longer be active shortly"))
        logging.resetLogging()
    logging.setupLogging(init.cfg)
    apply_reporting_cfg(init.cfg)

    # Any log usage prior to setupLogging above did not have local user log
    # config applied. We send the welcome message now, as stderr/out have
    # been redirected and log now configured.
    # Emit the welcome message.
    welcome(name, msg=w_msg)

    # re-play early log messages before logging was setup
    for lvl, msg in early_logs:
        LOG.log(lvl, msg)

    # Stage 3
    try:
        # Create the directories and files cloud-init needs, including the
        # sub-directories under /var/lib/cloud/ and the log file.
        init.initialize()
    except Exception:
        util.logexc(LOG, "Failed to initialize, likely bad things to come!")

    # Stage 4
    # Check the manual_cache_clean option. If it is false, cloud-init uses
    # the instance id to decide whether the running instance is a new one;
    # otherwise no check is made, which may leave per-instance modules
    # un-run after an instance has been migrated.
    # Local stage: purge the cache (boot_finished, no-net).
    # Network stage: exit early if the no-net file exists.
    path_helper = init.paths
    mode = sources.DSMODE_LOCAL if args.local else sources.DSMODE_NETWORK

    if mode == sources.DSMODE_NETWORK:
        existing = "trust"
        sys.stderr.write("%s\n" % (netinfo.debug_info()))
        LOG.debug(("Checking to see if files that we need already"
                   " exist from a previous run that would allow us"
                   " to stop early."))
        # no-net is written by upstart cloud-init-nonet when network failed
        # to come up
        stop_files = [
            os.path.join(path_helper.get_cpath("data"), "no-net"),
        ]
        existing_files = []
        for fn in stop_files:
            if os.path.isfile(fn):
                existing_files.append(fn)

        if existing_files:
            LOG.debug("[%s] Exiting. stop file %s existed",
                      mode, existing_files)
            return (None, [])
        else:
            LOG.debug("Execution continuing, no previous run detected that"
                      " would allow us to stop early.")
    else:
        existing = "check"
        mcfg = util.get_cfg_option_bool(init.cfg, 'manual_cache_clean', False)
        if mcfg:
            LOG.debug("manual cache clean set from config")
            existing = "trust"
        else:
            mfile = path_helper.get_ipath_cur("manual_clean_marker")
            if os.path.exists(mfile):
                LOG.debug("manual cache clean found from marker: %s", mfile)
                existing = "trust"

        init.purge_cache()
        # Delete the no-net file as well
        util.del_file(os.path.join(path_helper.get_cpath("data"), "no-net"))

    # Stage 5
    # Fetch data from the datasource. Whether the data is loaded from the
    # cache depends on whether the obj.pkl cache file exists, on the
    # `existing` variable, and on whether instance_id matches
    # /run/cloud-init/instance-id. Otherwise every datasource is tried in
    # turn and the first one able to provide data becomes this instance's
    # datasource.
    # (internally: s.update_metadata([EventType.BOOT_NEW_INSTANCE]), _get_data)
    try:
        init.fetch(existing=existing)
        # if in network mode, and the datasource is local
        # then work was done at that stage.
        # In the network stage, finish right away if the datasource's dsmode
        # is not network.
        if mode == sources.DSMODE_NETWORK and init.datasource.dsmode != mode:
            LOG.debug("[%s] Exiting. datasource %s in local mode",
                      mode, init.datasource)
            return (None, [])
    except sources.DataSourceNotFoundException:
        # In the case of 'cloud-init init' without '--local' it is a bit
        # more likely that the user would consider it failure if nothing was
        # found. When using upstart it will also mentions job failure
        # in console log if exit code is != 0.
        if mode == sources.DSMODE_LOCAL:
            LOG.debug("No local datasource found")
        else:
            util.logexc(LOG, ("No instance datasource found!"
                              " Likely bad things to come!"))
        if not args.force:
            init.apply_network_config(bring_up=not args.local)
            LOG.debug("[%s] Exiting without datasource", mode)
            if mode == sources.DSMODE_LOCAL:
                return (None, [])
            else:
                return (None, ["No instance datasource found."])
        else:
            LOG.debug("[%s] barreling on in force mode without datasource",
                      mode)

    # If the datasource was restored from the cache and instance-data.json
    # is missing, regenerate it.
    _maybe_persist_instance_data(init)

    # Stage 6
    # Create the /var/lib/cloud/<instance_id>/ symlink, create the handlers,
    # scripts and sem directories, and write the datasource file.
    # Write previous-datasource, instance-id and previous-instance-id under
    # /var/lib/cloud/data/.
    # Write the .instance_id file under /run/cloud-init/.
    # If manual_cache_clean is true, write
    # /var/lib/cloud/<instance_id>/manual_clean_marker.
    # Write obj.pkl.
    # Refresh the configuration held by the init object.
    iid = init.instancify()
    LOG.debug("[%s] %s will now be targeting instance id: %s. new=%s",
              mode, name, iid, init.is_new_instance())

    if mode == sources.DSMODE_LOCAL:
        # Before network comes up, set any configured hostname to allow
        # dhcp clients to advertize this hostname to any DDNS services
        # LP: #1746455.
        _maybe_set_hostname(init, stage='local', retry_stage='network')

    # Apply the network configuration; the network stage also brings the
    # network up.
    # Returns immediately if /var/lib/cloud/data/upgraded-network exists.
    #   netcfg = self.datasource.network_config
    #   self._apply_netcfg_names(netcfg)
    #   self.distro.apply_network_config(netcfg, bring_up=bring_up)
    init.apply_network_config(bring_up=bool(mode != sources.DSMODE_LOCAL))

    # In the local stage, return right away if the datasource's dsmode is
    # not local.
    if mode == sources.DSMODE_LOCAL:
        if init.datasource.dsmode != mode:
            LOG.debug("[%s] Exiting. datasource %s not in local mode.",
                      mode, init.datasource)
            return (init.datasource, [])
        else:
            LOG.debug("[%s] %s is in local mode, will apply init modules now.",
                      mode, init.datasource)

    # Give the datasource a chance to use network resources.
    # This is used on Azure to communicate with the fabric over network.
    # Called before user-data and vendor-data are processed, so that the
    # datasource can be refreshed once the network is up; currently only
    # used by Azure to fetch fabric data and fill in fabric_data.
    # Calls self.datasource.setup(is_new_instance=self.is_new_instance()).
    init.setup_datasource()

    # update fully realizes user-data (pulling in #include if necessary)
    # Store and render user-data and vendor-data:
    # _store_userdata() writes user-data.txt and user-data.txt.i under
    # /var/lib/cloud/instance/;
    # _store_vendordata() writes vendor-data.txt and vendor-data.txt.i under
    # /var/lib/cloud/instance/.
    init.update()
    _maybe_set_hostname(init, stage='init-net', retry_stage='modules:config')

    # Stage 7
    try:
        # Attempt to consume the data per instance.
        # This may run user-data handlers and/or perform
        # url downloads and such as needed.
        # Consume user-data and vendor-data:
        # unless allow_userdata is false, run
        # _consume_userdata(PER_INSTANCE) ("reading and applying user-data"),
        # which writes cloud-config.txt under /var/lib/cloud/instance/;
        # run _consume_vendordata(PER_INSTANCE) ("vendor data will be
        # consumed"), which writes vendor-cloud-config.txt under
        # /var/lib/cloud/instance/ and the vendor-data scripts under
        # /var/lib/cloud/instance/scripts/vendor/.
        (ran, _results) = init.cloudify().run('consume_data',
                                              init.consume_data,
                                              args=[PER_INSTANCE],
                                              freq=PER_INSTANCE)
        if not ran:
            # Just consume anything that is set to run per-always
            # if nothing ran in the per-instance code
            #
            # See: https://bugs.launchpad.net/bugs/819507 for a little
            # reason behind this...
            init.consume_data(PER_ALWAYS)
    except Exception:
        util.logexc(LOG, "Consuming user data failed!")
        return (init.datasource, ["Consuming user data failed!"])
    apply_reporting_cfg(init.cfg)

    # Stage 8 - re-read and apply relevant cloud-config to include user-data
    # Instantiate the Modules class.
    # Merges every cloud-config source: /etc/cloud/cloud.cfg{,.d},
    # /run/cloud-init/cloud.cfg, /proc/cmdline,
    # /var/lib/cloud/instance/cloud-config.txt and
    # /var/lib/cloud/instance/vendor-cloud-config.txt.
    mods = stages.Modules(init, extract_fns(args), reporter=args.reporter)

    # Stage 9
    try:
        # Redirect log output again, this time using the mods object's config.
        outfmt_orig = outfmt
        errfmt_orig = errfmt
        (outfmt, errfmt) = util.get_output_cfg(mods.cfg, name)
        if outfmt_orig != outfmt or errfmt_orig != errfmt:
            LOG.warning("Stdout, stderr changing to (%s, %s)",
                        outfmt, errfmt)
            (outfmt, errfmt) = util.fixup_output(mods.cfg, name)
    except Exception:
        util.logexc(LOG, "Failed to re-adjust output redirection!")
    logging.setupLogging(mods.cfg)

    # give the activated datasource a chance to adjust
    # Calls self.datasource.activate, which runs after user-data and
    # vendor-data have been rendered and before the init modules execute.
    # Writes /var/lib/cloud/instance/obj.pkl.
    init.activate_datasource()

    di_report_warn(datasource=init.datasource, cfg=init.cfg)

    # Stage 10
    # Run the init modules.
    return (init.datasource, run_module_section(mods, name, name))
cloud-init modules
cmd/main.py, main_modules(),cloud-init在config和final階段會執行該函式,分析如下:
def main_modules(action_name, args):
    """Run a cloud-init ``modules`` stage (``cloud-init modules``).

    ``args.mode`` names the stage to run (config or final, per the comment
    below); ``args.reporter``, ``args.force`` and ``args.debug`` are also
    read here.

    Returns a one-element list holding the failure message when no
    datasource is found and ``args.force`` is not set; otherwise returns
    whatever ``run_module_section`` returns.
    """
    # "config" or "final"
    name = args.mode
    # e.g. "Cloud-init v. 19.4 running 'modules:config' at Wed, 15 Dec 2021
    # 03:01:15 +0000. Up 280.96 seconds."
    w_msg = welcome_format("%s:%s" % (action_name, name))

    # Instantiate the Init class.
    init = stages.Init(ds_deps=[], reporter=args.reporter)

    # Stage 1
    # Load configuration. Priority from lowest to highest:
    # built-in config --> /etc/cloud/cloud.cfg{,.d}
    # --> /run/cloud-init/cloud.cfg --> kernel cmdline
    init.read_cfg(extract_fns(args))

    # Stage 2
    try:
        # Fetch data from the datasource. If the obj.pkl cache file exists,
        # the data is loaded from the cache; otherwise every datasource is
        # tried in turn and the first one able to provide data becomes this
        # instance's datasource.
        # (internally: s.update_metadata([EventType.BOOT_NEW_INSTANCE]),
        # _get_data)
        init.fetch(existing="trust")
    except sources.DataSourceNotFoundException:
        # There was no datasource found, theres nothing to do
        msg = ('Can not apply stage %s, no datasource found! Likely bad '
               'things to come!' % name)
        util.logexc(LOG, msg)
        print_exc(msg)
        if not args.force:
            return [(msg)]

    # If the datasource was restored from the cache and instance-data.json
    # is missing, regenerate it.
    _maybe_persist_instance_data(init)

    # Stage 3
    # Instantiate the Modules class.
    mods = stages.Modules(init, extract_fns(args), reporter=args.reporter)

    # Stage 4
    # Redirect standard output to the log file.
    try:
        LOG.debug("Closing stdin")
        util.close_stdin()
        util.fixup_output(mods.cfg, name)
    except Exception:
        util.logexc(LOG, "Failed to setup output redirection!")
    if args.debug:
        # Reset so that all the debug handlers are closed out
        LOG.debug(("Logging being reset, this logger may no"
                   " longer be active shortly"))
        logging.resetLogging()
    logging.setupLogging(mods.cfg)
    apply_reporting_cfg(init.cfg)

    # now that logging is setup and stdout redirected, send welcome
    welcome(name, msg=w_msg)

    # Stage 5
    # Run each module of this stage.
    return run_module_section(mods, name, name)
針對redhat系的定製
收錄於centos yum倉庫的cloud-init是定製的版本,在開源cloud-init的基礎上合入了一系列針對redhat系linux的patch。目前收錄如下:
- Add initial redhat setup。此補丁包含多個補丁,主要包含對預設配置的改動,例如將system_info.distro更改為rhel,新增預設cloud.cfg配置檔案,以及新增一系列systemd服務配置檔案
- Do not write NM_CONTROLLED=no in generated interface config files。
- limit permissions on def_log_file。新增日誌檔案使用者許可權配置選項def_log_file_mode,且設定其預設值為0600
- sysconfig: Don't write BOOTPROTO=dhcp for ipv6 dhcp。
- DataSourceAzure.py: use hostnamectl to set hostname。
- include 'NOZEROCONF=yes' in /etc/sysconfig/network。雲上例項需要使用該配置
- Remove race condition between cloud-init and NetworkManager。移除systemd服務中對NetworkManager的競爭,設定ssh_deletekeys為1
- net: exclude OVS internal interfaces in get_interfaces。
- Fix requiring device-number on EC2 derivatives。
- rhel/cloud.cfg: remove ssh_genkeytypes in settings.py and set in cloud.cfg。在cloud.cfg中新增 ssh_genkeytypes,首次開機啟動時生成公私鑰
- write passwords only to serial console, lock down cloud-init-output.log。
- ssh-util: allow cloudinit to merge all ssh keys into a custom user file, defined in AuthorizedKeysFile。
- Stop copying ssh system keys and check folder permissions。
- Fix home permissions modified by ssh module (SC-338)。
- ssh_utils.py: ignore when sshd_config options are not key/value pairs。
- cc_ssh.py: fix private key group owner and permissions。
針對OpenStack的變更
- 基於centos8 cloud-init-21.1-9.el8.src.rpm作變更,此包基於開源cloud-init 21.1版本合入了多個redhat系linux定製的patch。
- cloud-init的detect_openstack方法(cloudinit/sources/DataSourceOpenStack.py)檢測例項是否位於OpenStack,以判斷是否應用OpenStack資料來源。由於裸金屬例項無法判斷是否屬於OpenStack,需修改此方法,直接返回True。
- ...