1. 程式人生 > >Ceph Crush 演算法原始碼分析

Ceph Crush 演算法原始碼分析

簡介:

Ceph Crush演算法是Ceph分散式系統中用於資料分佈(定位)的核心演算法,其核心元件有crush rule、bucket algorithm。crush rule是可以自定義的選擇過程,bucket algorithm是從bucket選取item時使用的演算法,該演算法需要的主要引數有:placement seed(pgid)、crush map、副本數等。本文將簡要介紹Ceph Crush演算法的實現。

先來看一個crush map的簡單例項:

{
//devices
    "devices": [
        {
            "id": 0,
            "name"
: "osd.0" }, { "id": 1, "name": "osd.1" }, ... { "id": 9, "name": "osd.9" } ], //type "types": [ { "type_id": 0, "name": "osd" }, { "type_id"
: 1, "name": "host" }, ... { "type_id": 10, "name": "root" } ], //buckets "buckets": [ { "id": -1, "name": "default", "type_id": 10, "type_name": "root", "weight"
: 821160, "alg": "straw", "hash": "rjenkins1", "items": [ { "id": -2, "weight": 142868, "pos": 0 }, { "id": -3, "weight": 142868, "pos": 1 }, { "id": -8, "weight": 178910, "pos": 2 }, { "id": -10, "weight": 356514, "pos": 3 } ] }, { "id": -2, "name": "ceph-osd-240", "type_id": 1, "type_name": "host", "weight": 142868, "alg": "straw", "hash": "rjenkins1", "items": [ { "id": 0, "weight": 142868, "pos": 0 } ] }, { "id": -3, "name": "ceph-osd-241", "type_id": 1, "type_name": "host", "weight": 142868, "alg": "straw", "hash": "rjenkins1", "items": [ { "id": 1, "weight": 142868, "pos": 0 } ] }, { "id": -8, "name": "ceph-osd-66", "type_id": 1, "type_name": "host", "weight": 178910, "alg": "straw", "hash": "rjenkins1", "items": [ { "id": 2, "weight": 36044, "pos": 0 }, { "id": 3, "weight": 32768, "pos": 1 }, { "id": 4, "weight": 3276, "pos": 2 }, { "id": 5, "weight": 34078, "pos": 3 }, { "id": 6, "weight": 36044, "pos": 4 }, { "id": 7, "weight": 36700, "pos": 5 } ] }, { "id": -10, "name": "ceph-osd-253", "type_id": 1, "type_name": "host", "weight": 356514, "alg": "straw", "hash": "rjenkins1", "items": [ { "id": 8, "weight": 178257, "pos": 0 }, { "id": 9, "weight": 178257, "pos": 1 } ] } ], //crush rule "rules": [ { "rule_id": 0, "rule_name": "replicated_ruleset", "ruleset": 0, "type": 1, "min_size": 1, "max_size": 10, "steps": [ { "op": "take", "item": -1, "item_name": "default" }, { "op": "chooseleaf_firstn", "num": 0, "type": "host" }, { "op": "emit" } ] } ], //相關的可調控的配置引數 "tunables": { "choose_local_tries": 0, "choose_local_fallback_tries": 0, "choose_total_tries": 50, "chooseleaf_descend_once": 1, "chooseleaf_vary_r": 0, "straw_calc_version": 1, "allowed_bucket_algs": 22, "profile": "unknown", "optimal_tunables": 0, "legacy_tunables": 0, "require_feature_tunables": 1, "require_feature_tunables2": 1, "require_feature_tunables3": 0, "has_v2_rules": 0, "has_v3_rules": 0, "has_v4_buckets": 0 } }

Crush演算法實現中構造的主要資料結構:

crush rule中的step op codes
/*
 * Step op codes: the operation performed by one step of a crush rule.
 * The per-op comments describe how the step's arg1/arg2 are interpreted.
 * Note that the value 5 is not assigned to any op in this list.
 */
enum {
    CRUSH_RULE_NOOP = 0,
    CRUSH_RULE_TAKE = 1,          /* arg1 = value to start with */
    CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
                                  /* arg2 = type */
    CRUSH_RULE_CHOOSE_INDEP = 3,  /* same */
    CRUSH_RULE_EMIT = 4,          /* no args */
    CRUSH_RULE_CHOOSELEAF_FIRSTN = 6,
    CRUSH_RULE_CHOOSELEAF_INDEP = 7,

    CRUSH_RULE_SET_CHOOSE_TRIES = 8, /* override choose_total_tries */
    CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */
    CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10,
    CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11,
    CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12
};


crush_map結構體
/*
 * CRUSH map includes all buckets, rules, etc.
 */
struct crush_map {
    /* bucket table; a bucket with (negative) id N lives at index -1-N */
    struct crush_bucket **buckets;
    /* rule table, indexed by rule number */
    struct crush_rule **rules;

    /* number of slots in buckets[]; bounds the -1-id index */
    __s32 max_buckets;
    /* number of slots in rules[]; bounds the rule number */
    __u32 max_rules;
    /* exclusive upper bound for valid (non-negative) device ids */
    __s32 max_devices;

    /* choose local retries before re-descent */
    __u32 choose_local_tries;
    /* choose local attempts using a fallback permutation before
     * re-descent */
    __u32 choose_local_fallback_tries;
    /* choose attempts before giving up */
    __u32 choose_total_tries;
    /* attempt chooseleaf inner descent once for firstn mode; on
     * reject retry outer descent.  Note that this does *not*
     * apply to a collision: in that case we will retry as we used
     * to. */
    __u32 chooseleaf_descend_once;

    /* if non-zero, feed r into chooseleaf, bit-shifted right by (r-1)
     * bits.  a value of 1 is best for new clusters.  for legacy clusters
     * that want to limit reshuffling, a value of 3 or 4 will make the
     * mappings line up a bit better with previous mappings. */
    __u8 chooseleaf_vary_r;

    /*
     * version 0 (original) of straw_calc has various flaws.  version 1
     * fixes a few of them.
     */
    __u8 straw_calc_version;
    /*
     * allowed bucket algs is a bitmask, here the bit positions
     * are CRUSH_BUCKET_*.  note that these are *bits* and
     * CRUSH_BUCKET_* values are not, so we need to or together (1
     * << CRUSH_BUCKET_WHATEVER).  The 0th bit is not used to
     * minimize confusion (bucket type values start at 1).
     */
    __u32 allowed_bucket_algs;

    /*
     * optional counters, may be NULL; when set, crush_choose_firstn
     * increments choose_tries[ftotal] for each item it places
     * (only while ftotal <= choose_total_tries).
     */
    __u32 *choose_tries;
};

/*
 * CRUSH uses user-defined "rules" to describe how inputs should be
 * mapped to devices.  A rule consists of sequence of steps to perform
 * to generate the set of output devices.
 *
 * One step of a rule: an op code plus two op-specific arguments.
 */
struct crush_rule_step {
    __u32 op;    /* one of the CRUSH_RULE_* step op codes */
    __s32 arg1;  /* TAKE: item to start with; CHOOSE*: num items to pick;
                  * SET_*: the override value */
    __s32 arg2;  /* CHOOSE*: type of item to pick */
};


/*
 * The rule mask is used to describe what the rule is intended for.
 * Given a ruleset and size of output set, we search through the
 * rule list for a matching rule_mask.
 */
struct crush_rule_mask {
    __u8 ruleset;   /* ruleset the rule belongs to */
    __u8 type;      /* rule type; NOTE(review): presumably matched against the pool type - confirm */
    __u8 min_size;  /* NOTE(review): presumably smallest output-set size the rule applies to - confirm */
    __u8 max_size;  /* NOTE(review): presumably largest output-set size the rule applies to - confirm */
};

crush_bucket結構體:
/*
 * Common bucket header: an interior node of the crush hierarchy that
 * holds a list of items.  Negative item ids refer to other buckets,
 * non-negative ids to devices.
 */
struct crush_bucket {
    __s32 id;        /* this'll be negative */
    __u16 type;      /* non-zero; type=0 is reserved for devices */
    __u8 alg;        /* one of CRUSH_BUCKET_* */
    __u8 hash;       /* which hash function to use, CRUSH_HASH_* */
    __u32 weight;    /* 16-bit fixed point */
    __u32 size;      /* num items */
    __s32 *items;    /* ids of the contained items; `size` entries */

    /*
     * cached random permutation: used for uniform bucket and for
     * the linear search fallback for the other bucket types.
     */
    __u32 perm_x;  /* @x for which *perm is defined */
    __u32 perm_n;  /* num elements of *perm that are permuted/defined */
    __u32 *perm;
};

/*
 * A crush rule: the placement policy used to map a pg to devices.
 * The header is followed in memory by the steps themselves.
 */
struct crush_rule {
    __u32 len;                        /* number of entries in steps[] */
    struct crush_rule_mask mask;      /* what this rule is intended for */
    struct crush_rule_step steps[0];  /* trailing array of `len` steps */
};


/*
 * Straw bucket (alg CRUSH_BUCKET_STRAW).  The common header comes first,
 * so a struct crush_bucket pointer can be cast to this type.
 */
struct crush_bucket_straw {
    struct crush_bucket h;
    __u32 *item_weights;   /* 16-bit fixed point */
    __u32 *straws;         /* 16-bit fixed point */
};

程式碼簡析

Crush 演算法入口


/*
 * Turn a raw pg (full precision ps) into the placement seed handed to
 * CRUSH.  The pool id is mixed into the seed so that different pools do
 * not end up using the same seeds.
 */
ps_t pg_pool_t::raw_pg_to_pps(pg_t pg) const
{
  if (!(flags & FLAG_HASHPSPOOL)) {
    // Legacy mode: simply add the pool id to the folded ps.  PGs of
    // different pools then land on top of each other
    // (0.5 == 1.4 == 2.3 == ...), which is why hashing is preferred.
    return ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask) + pg.pool();
  }

  // FLAG_HASHPSPOOL: hash the folded ps together with the pool id so
  // that the PGs of different pools do not overlap.
  return crush_hash32_2(CRUSH_HASH_RJENKINS1,
                        ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask),
                        pg.pool());
}

//將PG對映到一組OSDS
int OSDMap::_pg_to_osds(const pg_pool_t& pool, pg_t pg,
                        vector<int> *osds, int *primary,
                        ps_t *ppps) const
{
  // map to osds[]
  ps_t pps = pool.raw_pg_to_pps(pg);  // placement ps
  //獲取pool的replicated size
  unsigned size = pool.get_size();

  // what crush rule? 獲取pool使用crush rule
  int ruleno = crush->find_rule(pool.get_crush_ruleset(), pool.get_type(), size);
  if (ruleno >= 0)
    crush->do_rule(ruleno, pps, *osds, size, osd_weight);
  //刪除不存在的osd
  _remove_nonexistent_osds(pool, *osds);

  *primary = -1;
  //選取primary osd(第一個作為primary osd)
  for (unsigned i = 0; i < osds->size(); ++i) {
    if ((*osds)[i] != CRUSH_ITEM_NONE) {
      *primary = (*osds)[i];
      break;
    }  
  }
  if (ppps)
    *ppps = pps; 

  return osds->size();
}

void do_rule(int rule, int x, vector<int>& out, int maxout,                                                                                                                           
      ¦ ¦ ¦ ¦const vector<__u32>& weight) const {                                                                                                                                    
¦ Mutex::Locker l(mapper_lock);                                                                                                                                                      
¦ int rawout[maxout];                                                                                                                                                                
¦ int scratch[maxout * 3];  
//開始crush過程:
//crush: crush map; rule:ruleset;x:placement seed; maxout:副本數;rawout:存放結果的資料                                                                                                                                                        
¦ int numrep = crush_do_rule(crush, rule, x, rawout, maxout, &weight[0], weight.size(), scratch);                                                                                                               
¦ if (numrep < 0)                                                                                                                                                                    
¦ ¦ numrep = 0;                                                                                                                                                                      
¦ out.resize(numrep);                                                                                                                                                                
¦ for (int i=0; i<numrep; i++)                                                                                                                                                       
¦ ¦ out[i] = rawout[i];                                                                                                                                                              
}

函式crush_do_rule

/**
 * crush_do_rule - calculate a mapping with the given input and rule
 * @map: the crush_map (devices, types, buckets, rules, ...)
 * @ruleno: the rule id; index into @map->rules
 * @x: hash input (the placement seed)
 * @result: pointer to result vector; receives the selected items
 * @result_max: maximum result size (number of items to select)
 * @weight: weight vector (for map leaves)
 * @weight_max: size of weight vector
 * @scratch: scratch vector for private use; must be >= 3 * result_max
 *
 * Walks the steps of the rule in order.  @scratch is split into three
 * result_max-sized areas: two of them alternate as the working set (w)
 * and the output set (o) of each choose step, the third (c) collects the
 * leaf items found by chooseleaf steps.
 *
 * Returns the number of entries written to @result; 0 if @ruleno is out
 * of range.
 */
int crush_do_rule(const struct crush_map *map,
                  int ruleno, int x, int *result, int result_max,
                  const __u32 *weight, int weight_max,
                  int *scratch)
{
        int result_len;
        int *a = scratch;
        int *b = scratch + result_max;
        int *c = scratch + result_max*2;
        int recurse_to_leaf;
        int *w;
        int wsize = 0;
        int *o;
        int osize;
        int *tmp;
        struct crush_rule *rule;
        __u32 step;
        int i, j;
        int numrep;
        int out_size;
        /*
         * the original choose_total_tries value was off by one (it
         * counted "retries" and not "tries").  add one.
         */
        int choose_tries = map->choose_total_tries + 1;
        int choose_leaf_tries = 0;
        /*
         * the local tries values were counted as "retries", though,
         * and need no adjustment
         */
        int choose_local_retries = map->choose_local_tries;
        int choose_local_fallback_retries = map->choose_local_fallback_tries;

        int vary_r = map->chooseleaf_vary_r;

        if ((__u32)ruleno >= map->max_rules) {
                dprintk(" bad ruleno %d\n", ruleno);
                return 0;
        }

        /* pick the rule to execute */
        rule = map->rules[ruleno];
        result_len = 0;
        w = a;
        o = b;

        for (step = 0; step < rule->len; step++) {
                int firstn = 0;
                struct crush_rule_step *curstep = &rule->steps[step];

                switch (curstep->op) {
                case CRUSH_RULE_TAKE:
                        /*
                         * the start item is either a device (id >= 0)
                         * or a bucket (negative id, stored at index
                         * -1-id); anything else is ignored
                         */
                        if ((curstep->arg1 >= 0 &&
                             curstep->arg1 < map->max_devices) ||
                            (-1-curstep->arg1 >= 0 &&
                             -1-curstep->arg1 < map->max_buckets &&
                             map->buckets[-1-curstep->arg1])) {
                                w[0] = curstep->arg1;
                                wsize = 1;
                        } else {
                                dprintk(" bad take value %d\n", curstep->arg1);
                        }
                        break;

                case CRUSH_RULE_SET_CHOOSE_TRIES:
                        if (curstep->arg1 > 0)
                                choose_tries = curstep->arg1;
                        break;

                case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
                        if (curstep->arg1 > 0)
                                choose_leaf_tries = curstep->arg1;
                        break;

                case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
                        if (curstep->arg1 >= 0)
                                choose_local_retries = curstep->arg1;
                        break;

                case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
                        if (curstep->arg1 >= 0)
                                choose_local_fallback_retries = curstep->arg1;
                        break;

                case CRUSH_RULE_SET_CHOOSELEAF_VARY_R:
                        if (curstep->arg1 >= 0)
                                vary_r = curstep->arg1;
                        break;

                case CRUSH_RULE_CHOOSELEAF_FIRSTN:
                case CRUSH_RULE_CHOOSE_FIRSTN:
                        firstn = 1;
                        /* fall through */
                case CRUSH_RULE_CHOOSELEAF_INDEP:
                case CRUSH_RULE_CHOOSE_INDEP:
                        /* nothing to choose from */
                        if (wsize == 0)
                                break;

                        /*
                         * chooseleaf ops additionally descend to a leaf
                         * (device) under every chosen item
                         */
                        recurse_to_leaf =
                                curstep->op ==
                                 CRUSH_RULE_CHOOSELEAF_FIRSTN ||
                                curstep->op ==
                                CRUSH_RULE_CHOOSELEAF_INDEP;

                        /* reset output */
                        osize = 0;

                        for (i = 0; i < wsize; i++) {
                                int bno;
                                /*
                                 * see CRUSH_N, CRUSH_N_MINUS macros.
                                 * basically, numrep <= 0 means relative to
                                 * the provided result_max
                                 *
                                 * i.e. a positive arg1 is the number of
                                 * items to pick; otherwise pick
                                 * arg1 + result_max items, and skip this
                                 * input if that is still not positive.
                                 */
                                numrep = curstep->arg1;
                                if (numrep <= 0) {
                                        numrep += result_max;
                                        if (numrep <= 0)
                                                continue;
                                }
                                j = 0;
                                /* make sure bucket id is valid */
                                bno = -1 - w[i];
                                if (bno < 0 || bno >= map->max_buckets) {
                                        // w[i] is probably CRUSH_ITEM_NONE
                                        dprintk("  bad w[i] %d\n", w[i]);
                                        continue;
                                }
                                if (firstn) {
                                        /*
                                         * tries budget handed to the
                                         * recursive chooseleaf descent
                                         * inside crush_choose_firstn
                                         */
                                        int recurse_tries;
                                        if (choose_leaf_tries)
                                                recurse_tries =
                                                        choose_leaf_tries;
                                        else if (map->chooseleaf_descend_once)
                                                recurse_tries = 1;
                                        else
                                                recurse_tries = choose_tries;
                                        /*
                                         * pick the requested number of
                                         * items below this bucket
                                         */
                                        osize += crush_choose_firstn(
                                                map,
                                                map->buckets[bno],
                                                weight, weight_max,
                                                x, numrep,
                                                curstep->arg2,
                                                o+osize, j,
                                                result_max-osize,
                                                choose_tries,
                                                recurse_tries,
                                                choose_local_retries,
                                                choose_local_fallback_retries,
                                                recurse_to_leaf,
                                                vary_r,
                                                c+osize,
                                                0);
                                } else {
                                        /* never write past the space left in o */
                                        out_size = ((numrep < (result_max-osize)) ?
                                                    numrep : (result_max-osize));
                                        /*
                                         * recursive tries: the rule's
                                         * chooseleaf_tries override if
                                         * set, otherwise 1
                                         */
                                        crush_choose_indep(
                                                map,
                                                map->buckets[bno],
                                                weight, weight_max,
                                                x, out_size, numrep,
                                                curstep->arg2,
                                                o+osize, j,
                                                choose_tries,
                                                choose_leaf_tries ?
                                                   choose_leaf_tries : 1,
                                                recurse_to_leaf,
                                                c+osize,
                                                0);
                                        osize += out_size;
                                }
                        }

                        /*
                         * for chooseleaf the final answer is the set of
                         * leaves collected in c, not the intermediate
                         * items in o
                         */
                        if (recurse_to_leaf)
                                /* copy final _leaf_ values to output set */
                                memcpy(o, c, osize*sizeof(*o));

                        /*
                         * swap o and w arrays: this step's output becomes
                         * the working set of the next step
                         */
                        tmp = o;
                        o = w;
                        w = tmp;
                        wsize = osize;
                        break;

                /* end of a selection: append the working set to result */
                case CRUSH_RULE_EMIT:
                        for (i = 0; i < wsize && result_len < result_max; i++) {
                                result[result_len] = w[i];
                                result_len++;
                        }
                        wsize = 0;
                        break;

                default:
                        dprintk(" unknown op %d at step %d\n",
                                curstep->op, step);
                        break;
                }
        }
        return result_len;
}

就上文中crush map例項的rule規則,結合程式碼實現過程可以知道:第一步take,從default開始選擇,其id為-1;第二步chooseleaf_firstn,相應地呼叫crush_choose_firstn函式,在default之下繼續選擇,該步選擇的bucket型別為host,指定的item數為0(如果大於0,則選擇指定個數的item;如果等於0,則選擇副本數個item;如果小於0,則與副本數求和,以其和作為item的個數,若和仍小於等於0則跳過該次選擇),並且recurse_to_leaf會被置為true,表示會遞迴地選擇到osd device為止;最後一步emit是rule的結束標誌,將最終的結果儲存到result vector中。與chooseleaf_firstn非常相似的是choose_firstn,該step只會選擇指定個數、指定型別的bucket/device。

注:scratch該引數被分成三部分(以副本數等分)用於不同的邏輯中,第一份用於存放step的最終結果,第二部分用於存放crush_choose_firstn的邏輯結果,第三部分用於crush_choose_firstn遞迴呼叫邏輯。

函式:crush_choose_firstn

/**
 * crush_choose_firstn - choose numrep distinct items of given type
 * @map: the crush_map
 * @bucket: the bucket we are choose an item from
 * @weight: weight vector (for map leaves)
 * @weight_max: size of weight vector
 * @x: crush input value
 * @numrep: the number of items to choose
 * @type: the type of item to choose
 * @out: pointer to output vector
 * @outpos: our position in that vector
 * @out_size: size of the out vector
 * @tries: number of attempts to make
 * @recurse_tries: number of attempts to have recursive chooseleaf make
 * @local_retries: localized retries
 * @local_fallback_retries: localized fallback retries
 * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
 * @vary_r: pass r to recursive calls
 * @out2: second output vector for leaf items (if @recurse_to_leaf); receives the device found under each chosen item
 * @parent_r: r value passed from the parent
 *
 * Returns the new output position, i.e. @outpos plus the number of items
 * that were placed in @out.
 */
static int crush_choose_firstn(const struct crush_map *map,
                               struct crush_bucket *bucket,
                               const __u32 *weight, int weight_max,
                               int x, int numrep, int type,
                               int *out, int outpos,
                               int out_size,
                               unsigned int tries,
                               unsigned int recurse_tries,
                               unsigned int local_retries,
                               unsigned int local_fallback_retries,
                               int recurse_to_leaf,
                               unsigned int vary_r,
                               int *out2,
                               int parent_r)
{
        int rep;
        unsigned int ftotal, flocal;
        int retry_descent, retry_bucket, skip_rep;
        struct crush_bucket *in = bucket;
        int r;
        int i;
        int item = 0;
        int itemtype;
        int collide, reject;
        int count = out_size;

        dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n",
                recurse_to_leaf ? "_LEAF" : "",
                bucket->id, x, outpos, numrep,
                tries, recurse_tries, local_retries, local_fallback_retries,
                parent_r);

        /* one iteration per replica, while there is room left in out */
        for (rep = outpos; rep < numrep && count > 0 ; rep++) {
                /* keep trying until we get a non-out, non-colliding item */
                ftotal = 0;
                skip_rep = 0;
                do {
                        retry_descent = 0;
                        in = bucket;               /* initial bucket */

                        /* choose through intervening buckets */
                        flocal = 0;
                        do {
                                collide = 0;
                                retry_bucket = 0;
                                r = rep + parent_r;
                                /* r' = r + f_total */
                                r += ftotal;

                                /* bucket choose */
                                if (in->size == 0) {
                                        reject = 1;
                                        goto reject;
                                }
                                if (local_fallback_retries > 0 &&
                                    flocal >= (in->size>>1) &&
                                    flocal > local_fallback_retries)
                                        item = bucket_perm_choose(in, x, r);
                                else
                                        /* pick one item out of bucket `in` */
                                        item = crush_bucket_choose(in, x, r);
                                /* sanity-check the chosen item */
                                if (item >= map->max_devices) {
                                        dprintk("   bad item %d\n", item);
                                        skip_rep = 1;
                                        break;
                                }

                                /* desired type? */
                                if (item < 0)
                                        /* a bucket: use its declared type */
                                        itemtype = map->buckets[-1-item]->type;
                                else
                                        itemtype = 0;
                                dprintk("  item %d type %d\n", item, itemtype);

                                /* keep going? */
                                /*
                                 * wrong type: a device (or an invalid
                                 * bucket index) means this replica fails;
                                 * otherwise descend into the chosen bucket
                                 * and choose again
                                 */
                                if (itemtype != type) {
                                        if (item >= 0 ||
                                            (-1-item) >= map->max_buckets) {
                                                dprintk("   bad item type %d\n", type);
                                                skip_rep = 1;
                                                break;
                                        }
                                        in = map->buckets[-1-item];
                                        retry_bucket = 1;
                                        continue;
                                }

                                /* collision? (item already chosen earlier) */
                                for (i = 0; i < outpos; i++) {
                                        if (out[i] == item) {
                                                collide = 1;
                                                break;
                                        }
                                }

                                reject = 0;
                                /*
                                 * chooseleaf without a collision: find a
                                 * leaf under the chosen item
                                 */
                                if (!collide && recurse_to_leaf) {
                                        /* a bucket: recurse to pick a device below it */
                                        if (item < 0) {
                                                int sub_r;
                                                if (vary_r)
                                                        sub_r = r >> (vary_r-1);
                                                else
                                                        sub_r = 0;
                                                if (crush_choose_firstn(map,
                                                         map->buckets[-1-item],
                                                         weight, weight_max,
                                                         x, outpos+1, 0,
                                                         out2, outpos, count,
                                                         recurse_tries, 0,
                                                         local_retries,
                                                         local_fallback_retries,
                                                         0,
                                                         vary_r,
                                                         NULL,
                                                         sub_r) <= outpos)
                                                        /* didn't get leaf */
                                                        reject = 1;
                                        } else {
                                                /* we already have a leaf! */
                                                out2[outpos] = item;
                                        }
                                }

                                if (!reject) {
                                        /* out? */
                                        if (itemtype == 0)
                                                /* devices may be rejected as "out" */
                                                reject = is_out(map, weight,
                                                                weight_max,
                                                                item, x);
                                        else
                                                reject = 0;
                                }

reject:
                                /* no usable item this time: decide how to retry */
                                if (reject || collide) {
                                        ftotal++;
                                        flocal++;

                                        if (collide && flocal <= local_retries)
                                                /* retry locally a few times */
                                                retry_bucket = 1;
                                        else if (local_fallback_retries > 0 &&
                                                 flocal <= in->size + local_fallback_retries)
                                                /* exhaustive bucket search */
                                                retry_bucket = 1;
                                        else if (ftotal < tries)
                                                /* then retry descent */
                                                retry_descent = 1;
                                        else
                                                /* else give up */
                                                skip_rep = 1;
                                        dprintk("  reject %d  collide %d  "
                                                "ftotal %u  flocal %u\n",
                                                reject, collide, ftotal,
                                                flocal);
                                }
                        } while (retry_bucket); /* choose again within `in` */
                } while (retry_descent);

                if (skip_rep) {
                        dprintk("skip rep\n");
                        continue;
                }

                dprintk("CHOOSE got %d\n", item);
                out[outpos] = item;
                outpos++;
                count--;

                /* optional statistics: how many failures this item took */
                if (map->choose_tries && ftotal <= map->choose_total_tries)
                        map->choose_tries[ftotal]++;
        }

        dprintk("CHOOSE returns %d\n", outpos);
        return outpos;
}

該函式簡單的說就是呼叫crush_bucket_choose(…)函式從指定的bucket中選擇合適的item,放入到out vector中;如果選中的item的型別不是期望的型別,且該item是bucket,則以選中的bucket為新的起點繼續呼叫crush_bucket_choose(…)(若選中的是device,則該次選擇失敗);如果當前的step是chooseleaf_firstn,則遞迴呼叫crush_choose_firstn(…),遞迴呼叫選中的osd會存放到out2 vector中,回到crush_do_rule後再由memcpy複製到輸出vector。

注:recurse_tries引數表示遞迴嘗試choose leaf的次數;當recurse_to_leaf為true且選中的item是bucket時,它會作為tries引數傳給遞迴呼叫的crush_choose_firstn(…),用來限制在該bucket下選取leaf的嘗試次數。

crush_bucket_choose函式

static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
{
        dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
        BUG_ON(in->size == 0);
        switch (in->alg) {
        case CRUSH_BUCKET_UNIFORM:
                return bucket_uniform_choose((struct crush_bucket_uniform *)in,
                                          x, r);
        case CRUSH_BUCKET_LIST:
                return bucket_list_choose((struct crush_bucket_list *)in,
                                          x, r);
        case CRUSH_BUCKET_TREE:
                return bucket_tree_choose((struct crush_bucket_tree *)in,
                                          x, r);
        case CRUSH_BUCKET_STRAW:
                return bucket_straw_choose((struct crush_bucket_straw *)in,
                                           x, r);
        case CRUSH_BUCKET_STRAW2:
                return bucket_straw2_choose((struct crush_bucket_straw2 *)in,
                                            x, r);
        default:
                dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
                return in->items[0];
        }