ext4: use buckets for cr 1 block scan instead of rbtree
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index fa9f1993a00cd7e526aced449799ae228d4e9308..bf4ec1c065728e5353ea33cb0b229c0706dc9b15 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
  *    number of buddy bitmap orders possible) number of lists. Group-infos are
  *    placed in appropriate lists.
  *
- * 2) Average fragment size rb tree (sbi->s_mb_avg_fragment_size_root)
+ * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size)
  *
- *    Locking: sbi->s_mb_rb_lock (rwlock)
+ *    Locking: sbi->s_mb_avg_fragment_size_locks (array of rw locks)
  *
- *    This is a red black tree consisting of group infos and the tree is sorted
- *    by average fragment sizes (which is calculated as ext4_group_info->bb_free
- *    / ext4_group_info->bb_fragments).
+ *    This is an array of lists where in the i-th list there are groups with
+ *    average fragment size >= 2^i and < 2^(i+1). The average fragment size
+ *    is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments.
+ *    Note that we don't bother with a special list for completely empty groups
+ *    so we only have MB_NUM_ORDERS(sb) lists.
  *
  * When "mb_optimize_scan" mount option is set, mballoc consults the above data
  * structures to decide the order in which groups are to be traversed for
  *
  * At CR = 1, we only consider groups where average fragment size > request
  * size. So, we lookup a group which has average fragment size just above or
- * equal to request size using our rb tree (data structure 2) in O(log N) time.
+ * equal to request size using our average fragment size group lists (data
+ * structure 2) in O(1) time.
  *
  * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in
  * linear order which requires O(N) search time for each CR 0 and CR 1 phase.
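
To make the bucketing concrete, here is a minimal userspace sketch of the index computation the new lists rely on; it mirrors the mb_avg_fragment_size_order() helper added further down in this patch. MB_NUM_ORDERS is assumed to be 14 (a 4k-block filesystem), and fls() is emulated with a GCC builtin, so the numbers are illustrative only:

    #include <stdio.h>

    #define MB_NUM_ORDERS 14                /* 4k blocks: s_blocksize_bits + 2 */

    static int fls32(unsigned int x)        /* stand-in for the kernel's fls() */
    {
            return x ? 32 - __builtin_clz(x) : 0;
    }

    /* Mirrors mb_avg_fragment_size_order(): list 0 takes averages of 1-3
     * blocks, list i >= 1 takes averages in [2^(i+1), 2^(i+2)), and the
     * top list also absorbs completely free groups. */
    static int avg_fragment_size_order(int len)
    {
            int order = fls32(len) - 2;

            if (order < 0)
                    return 0;
            if (order == MB_NUM_ORDERS)
                    order--;
            return order;
    }

    int main(void)
    {
            int lens[] = { 1, 3, 4, 7, 300, 1024, 32768 };

            for (unsigned int i = 0; i < sizeof(lens) / sizeof(lens[0]); i++)
                    printf("average free extent %5d blocks -> list %d\n",
                           lens[i], avg_fragment_size_order(lens[i]));
            return 0;
    }

Compared to the rb-tree, finding the starting bucket for a request is a single fls() on the request length, which is where the O(1) claim above comes from.
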
@@ -802,65 +805,51 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
        }
 }
 
-static void ext4_mb_rb_insert(struct rb_root *root, struct rb_node *new,
-                       int (*cmp)(struct rb_node *, struct rb_node *))
+static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
 {
-       struct rb_node **iter = &root->rb_node, *parent = NULL;
+       int order;
 
-       while (*iter) {
-               parent = *iter;
-               if (cmp(new, *iter) > 0)
-                       iter = &((*iter)->rb_left);
-               else
-                       iter = &((*iter)->rb_right);
-       }
-
-       rb_link_node(new, parent, iter);
-       rb_insert_color(new, root);
-}
-
-static int
-ext4_mb_avg_fragment_size_cmp(struct rb_node *rb1, struct rb_node *rb2)
-{
-       struct ext4_group_info *grp1 = rb_entry(rb1,
-                                               struct ext4_group_info,
-                                               bb_avg_fragment_size_rb);
-       struct ext4_group_info *grp2 = rb_entry(rb2,
-                                               struct ext4_group_info,
-                                               bb_avg_fragment_size_rb);
-       int num_frags_1, num_frags_2;
-
-       num_frags_1 = grp1->bb_fragments ?
-               grp1->bb_free / grp1->bb_fragments : 0;
-       num_frags_2 = grp2->bb_fragments ?
-               grp2->bb_free / grp2->bb_fragments : 0;
-
-       return (num_frags_2 - num_frags_1);
+       /*
+        * We don't bother with special lists for groups with only single-block
+        * free extents or for completely empty groups.
+        */
+       order = fls(len) - 2;
+       if (order < 0)
+               return 0;
+       if (order == MB_NUM_ORDERS(sb))
+               order--;
+       return order;
 }
 
-/*
- * Reinsert grpinfo into the avg_fragment_size tree with new average
- * fragment size.
- */
+/* Move group to appropriate avg_fragment_size list */
 static void
 mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
+       int new_order;
 
        if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0)
                return;
 
-       write_lock(&sbi->s_mb_rb_lock);
-       if (!RB_EMPTY_NODE(&grp->bb_avg_fragment_size_rb)) {
-               rb_erase(&grp->bb_avg_fragment_size_rb,
-                               &sbi->s_mb_avg_fragment_size_root);
-               RB_CLEAR_NODE(&grp->bb_avg_fragment_size_rb);
-       }
+       new_order = mb_avg_fragment_size_order(sb,
+                                       grp->bb_free / grp->bb_fragments);
+       if (new_order == grp->bb_avg_fragment_size_order)
+               return;
 
-       ext4_mb_rb_insert(&sbi->s_mb_avg_fragment_size_root,
-               &grp->bb_avg_fragment_size_rb,
-               ext4_mb_avg_fragment_size_cmp);
-       write_unlock(&sbi->s_mb_rb_lock);
+       if (grp->bb_avg_fragment_size_order != -1) {
+               write_lock(&sbi->s_mb_avg_fragment_size_locks[
+                                       grp->bb_avg_fragment_size_order]);
+               list_del(&grp->bb_avg_fragment_size_node);
+               write_unlock(&sbi->s_mb_avg_fragment_size_locks[
+                                       grp->bb_avg_fragment_size_order]);
+       }
+       grp->bb_avg_fragment_size_order = new_order;
+       write_lock(&sbi->s_mb_avg_fragment_size_locks[
+                                       grp->bb_avg_fragment_size_order]);
+       list_add_tail(&grp->bb_avg_fragment_size_node,
+               &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]);
+       write_unlock(&sbi->s_mb_avg_fragment_size_locks[
+                                       grp->bb_avg_fragment_size_order]);
 }
 
 /*
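
As a rough model of the locking scheme that replaces the single s_mb_rb_lock, the sketch below (plain userspace C with pthread rwlocks; the names, the list implementation and NUM_ORDERS = 14 are illustrative, not kernel API) follows the same move pattern as mb_update_avg_fragment_size(): a group is unlinked under the write lock of its current bucket and relinked under the write lock of the new one, so concurrent updates only contend when they touch the same bucket.

    #include <pthread.h>
    #include <stdio.h>

    #define NUM_ORDERS 14

    struct node {                           /* stand-in for the kernel's list_head */
            struct node *prev, *next;
    };

    struct group {
            struct node link;               /* like bb_avg_fragment_size_node */
            int order;                      /* like bb_avg_fragment_size_order, -1 if unlisted */
    };

    static struct node buckets[NUM_ORDERS];         /* like s_mb_avg_fragment_size */
    static pthread_rwlock_t locks[NUM_ORDERS];      /* like s_mb_avg_fragment_size_locks */

    static void list_init(struct node *h) { h->prev = h->next = h; }

    static void list_del_node(struct node *n)
    {
            n->prev->next = n->next;
            n->next->prev = n->prev;
    }

    static void list_add_tail_node(struct node *n, struct node *h)
    {
            n->prev = h->prev;
            n->next = h;
            h->prev->next = n;
            h->prev = n;
    }

    /* Same shape as mb_update_avg_fragment_size(). */
    static void move_group(struct group *g, int new_order)
    {
            if (new_order == g->order)
                    return;
            if (g->order != -1) {
                    pthread_rwlock_wrlock(&locks[g->order]);
                    list_del_node(&g->link);
                    pthread_rwlock_unlock(&locks[g->order]);
            }
            g->order = new_order;
            pthread_rwlock_wrlock(&locks[new_order]);
            list_add_tail_node(&g->link, &buckets[new_order]);
            pthread_rwlock_unlock(&locks[new_order]);
    }

    int main(void)
    {
            struct group g = { .order = -1 };

            for (int i = 0; i < NUM_ORDERS; i++) {
                    list_init(&buckets[i]);
                    pthread_rwlock_init(&locks[i], NULL);
            }
            move_group(&g, 3);      /* average free extent grew into [16, 32) blocks */
            move_group(&g, 5);      /* ...and later into [64, 128) blocks */
            printf("group now on bucket %d\n", g.order);
            return 0;
    }

In the kernel, the read side of these per-bucket locks is taken by the CR 1 scanner shown further down in the patch.
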
@@ -909,86 +898,56 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
                *new_cr = 1;
        } else {
                *group = grp->bb_group;
-               ac->ac_last_optimal_group = *group;
                ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED;
        }
 }
 
 /*
- * Choose next group by traversing average fragment size tree. Updates *new_cr
- * if cr lvel needs an update. Sets EXT4_MB_SEARCH_NEXT_LINEAR to indicate that
- * the linear search should continue for one iteration since there's lock
- * contention on the rb tree lock.
+ * Choose next group by traversing average fragment size list of suitable
+ * order. Updates *new_cr if cr level needs an update.
  */
 static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac,
                int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
 {
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
-       int avg_fragment_size, best_so_far;
-       struct rb_node *node, *found;
-       struct ext4_group_info *grp;
-
-       /*
-        * If there is contention on the lock, instead of waiting for the lock
-        * to become available, just continue searching lineraly. We'll resume
-        * our rb tree search later starting at ac->ac_last_optimal_group.
-        */
-       if (!read_trylock(&sbi->s_mb_rb_lock)) {
-               ac->ac_flags |= EXT4_MB_SEARCH_NEXT_LINEAR;
-               return;
-       }
+       struct ext4_group_info *grp = NULL, *iter;
+       int i;
 
        if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) {
                if (sbi->s_mb_stats)
                        atomic_inc(&sbi->s_bal_cr1_bad_suggestions);
-               /* We have found something at CR 1 in the past */
-               grp = ext4_get_group_info(ac->ac_sb, ac->ac_last_optimal_group);
-               for (found = rb_next(&grp->bb_avg_fragment_size_rb); found != NULL;
-                    found = rb_next(found)) {
-                       grp = rb_entry(found, struct ext4_group_info,
-                                      bb_avg_fragment_size_rb);
+       }
+
+       for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
+            i < MB_NUM_ORDERS(ac->ac_sb); i++) {
+               if (list_empty(&sbi->s_mb_avg_fragment_size[i]))
+                       continue;
+               read_lock(&sbi->s_mb_avg_fragment_size_locks[i]);
+               if (list_empty(&sbi->s_mb_avg_fragment_size[i])) {
+                       read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]);
+                       continue;
+               }
+               grp = NULL;
+               list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i],
+                                   bb_avg_fragment_size_node) {
                        if (sbi->s_mb_stats)
                                atomic64_inc(&sbi->s_bal_cX_groups_considered[1]);
-                       if (likely(ext4_mb_good_group(ac, grp->bb_group, 1)))
+                       if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) {
+                               grp = iter;
                                break;
-               }
-               goto done;
-       }
-
-       node = sbi->s_mb_avg_fragment_size_root.rb_node;
-       best_so_far = 0;
-       found = NULL;
-
-       while (node) {
-               grp = rb_entry(node, struct ext4_group_info,
-                              bb_avg_fragment_size_rb);
-               avg_fragment_size = 0;
-               if (ext4_mb_good_group(ac, grp->bb_group, 1)) {
-                       avg_fragment_size = grp->bb_fragments ?
-                               grp->bb_free / grp->bb_fragments : 0;
-                       if (!best_so_far || avg_fragment_size < best_so_far) {
-                               best_so_far = avg_fragment_size;
-                               found = node;
                        }
                }
-               if (avg_fragment_size > ac->ac_g_ex.fe_len)
-                       node = node->rb_right;
-               else
-                       node = node->rb_left;
+               read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]);
+               if (grp)
+                       break;
        }
 
-done:
-       if (found) {
-               grp = rb_entry(found, struct ext4_group_info,
-                              bb_avg_fragment_size_rb);
+       if (grp) {
                *group = grp->bb_group;
                ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED;
        } else {
                *new_cr = 2;
        }
-
-       read_unlock(&sbi->s_mb_rb_lock);
-       ac->ac_last_optimal_group = *group;
 }
 
 static inline int should_optimize_scan(struct ext4_allocation_context *ac)
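
A worked example of where the new CR 1 chooser starts (the request size is made up): for a goal extent of ac_g_ex.fe_len = 300 blocks,

    fls(300) = 9,  so mb_avg_fragment_size_order() returns 9 - 2 = 7

and the scan begins at s_mb_avg_fragment_size[7], the bucket of groups whose average free extent is 256-511 blocks, then moves on to larger buckets until ext4_mb_good_group() accepts a group or the buckets run out, at which point *new_cr is bumped to 2. The unlocked list_empty() check only lets the scanner skip empty buckets without touching the lock; the re-check under the read lock is the one that is trusted.
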
@@ -1017,11 +976,6 @@ next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups)
                goto inc_and_return;
        }
 
-       if (ac->ac_flags & EXT4_MB_SEARCH_NEXT_LINEAR) {
-               ac->ac_flags &= ~EXT4_MB_SEARCH_NEXT_LINEAR;
-               goto inc_and_return;
-       }
-
        return group;
 inc_and_return:
        /*
@@ -1152,13 +1106,13 @@ void ext4_mb_generate_buddy(struct super_block *sb,
                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
        }
        mb_set_largest_free_order(sb, grp);
+       mb_update_avg_fragment_size(sb, grp);
 
        clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
 
        period = get_cycles() - period;
        atomic_inc(&sbi->s_mb_buddies_generated);
        atomic64_add(period, &sbi->s_mb_generation_time);
-       mb_update_avg_fragment_size(sb, grp);
 }
 
 /* The buddy information is attached the buddy cache inode
@@ -2705,7 +2659,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
                 * from the goal value specified
                 */
                group = ac->ac_g_ex.fe_group;
-               ac->ac_last_optimal_group = group;
                ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups;
                prefetch_grp = group;
 
@@ -2987,9 +2940,7 @@ __acquires(&EXT4_SB(sb)->s_mb_rb_lock)
        struct super_block *sb = pde_data(file_inode(seq->file));
        unsigned long position;
 
-       read_lock(&EXT4_SB(sb)->s_mb_rb_lock);
-
-       if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1)
+       if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb))
                return NULL;
        position = *pos + 1;
        return (void *) ((unsigned long) position);
@@ -3001,7 +2952,7 @@ static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, lof
        unsigned long position;
 
        ++*pos;
-       if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1)
+       if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb))
                return NULL;
        position = *pos + 1;
        return (void *) ((unsigned long) position);
@@ -3013,29 +2964,22 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        unsigned long position = ((unsigned long) v);
        struct ext4_group_info *grp;
-       struct rb_node *n;
-       unsigned int count, min, max;
+       unsigned int count;
 
        position--;
        if (position >= MB_NUM_ORDERS(sb)) {
-               seq_puts(seq, "fragment_size_tree:\n");
-               n = rb_first(&sbi->s_mb_avg_fragment_size_root);
-               if (!n) {
-                       seq_puts(seq, "\ttree_min: 0\n\ttree_max: 0\n\ttree_nodes: 0\n");
-                       return 0;
-               }
-               grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb);
-               min = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0;
-               count = 1;
-               while (rb_next(n)) {
-                       count++;
-                       n = rb_next(n);
-               }
-               grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb);
-               max = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0;
+               position -= MB_NUM_ORDERS(sb);
+               if (position == 0)
+                       seq_puts(seq, "avg_fragment_size_lists:\n");
 
-               seq_printf(seq, "\ttree_min: %u\n\ttree_max: %u\n\ttree_nodes: %u\n",
-                          min, max, count);
+               count = 0;
+               read_lock(&sbi->s_mb_avg_fragment_size_locks[position]);
+               list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position],
+                                   bb_avg_fragment_size_node)
+                       count++;
+               read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]);
+               seq_printf(seq, "\tlist_order_%u_groups: %u\n",
+                                       (unsigned int)position, count);
                return 0;
        }
 
@@ -3045,9 +2989,11 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
                seq_puts(seq, "max_free_order_lists:\n");
        }
        count = 0;
+       read_lock(&sbi->s_mb_largest_free_orders_locks[position]);
        list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position],
                            bb_largest_free_order_node)
                count++;
+       read_unlock(&sbi->s_mb_largest_free_orders_locks[position]);
        seq_printf(seq, "\tlist_order_%u_groups: %u\n",
                   (unsigned int)position, count);
 
@@ -3055,11 +3001,7 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
 }
 
 static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v)
-__releases(&EXT4_SB(sb)->s_mb_rb_lock)
 {
-       struct super_block *sb = pde_data(file_inode(seq->file));
-
-       read_unlock(&EXT4_SB(sb)->s_mb_rb_lock);
 }
 
 const struct seq_operations ext4_mb_seq_structs_summary_ops = {
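
With the rb-tree summary gone, this seq_file iterates over 2 * MB_NUM_ORDERS(sb) positions: the first MB_NUM_ORDERS positions print the existing max_free_order_lists counters, the remaining ones print one counter per avg_fragment_size list, each taken under its own read lock. Purely illustrative output (group counts invented, 14 lists as on a 4k-block filesystem, and assuming this is the mb_structs_summary file under /proc/fs/ext4/<dev>/):

    max_free_order_lists:
            list_order_0_groups: 2
            list_order_1_groups: 1
            ...
            list_order_13_groups: 5
    avg_fragment_size_lists:
            list_order_0_groups: 0
            list_order_1_groups: 3
            ...
            list_order_13_groups: 4
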
@@ -3172,8 +3114,9 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
        init_rwsem(&meta_group_info[i]->alloc_sem);
        meta_group_info[i]->bb_free_root = RB_ROOT;
        INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node);
-       RB_CLEAR_NODE(&meta_group_info[i]->bb_avg_fragment_size_rb);
+       INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node);
        meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
+       meta_group_info[i]->bb_avg_fragment_size_order = -1;  /* uninit */
        meta_group_info[i]->bb_group = group;
 
        mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
@@ -3422,7 +3365,24 @@ int ext4_mb_init(struct super_block *sb)
                i++;
        } while (i < MB_NUM_ORDERS(sb));
 
-       sbi->s_mb_avg_fragment_size_root = RB_ROOT;
+       sbi->s_mb_avg_fragment_size =
+               kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
+                       GFP_KERNEL);
+       if (!sbi->s_mb_avg_fragment_size) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       sbi->s_mb_avg_fragment_size_locks =
+               kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
+                       GFP_KERNEL);
+       if (!sbi->s_mb_avg_fragment_size_locks) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
+               INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]);
+               rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]);
+       }
        sbi->s_mb_largest_free_orders =
                kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
                        GFP_KERNEL);
@@ -3441,7 +3401,6 @@ int ext4_mb_init(struct super_block *sb)
                INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]);
                rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]);
        }
-       rwlock_init(&sbi->s_mb_rb_lock);
 
        spin_lock_init(&sbi->s_md_lock);
        sbi->s_mb_free_pending = 0;
@@ -3512,6 +3471,8 @@ int ext4_mb_init(struct super_block *sb)
        free_percpu(sbi->s_locality_groups);
        sbi->s_locality_groups = NULL;
 out:
+       kfree(sbi->s_mb_avg_fragment_size);
+       kfree(sbi->s_mb_avg_fragment_size_locks);
        kfree(sbi->s_mb_largest_free_orders);
        kfree(sbi->s_mb_largest_free_orders_locks);
        kfree(sbi->s_mb_offsets);
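
One detail of the error handling: both new arrays are freed unconditionally at the out: label, which is safe because ext4_sb_info is zero-allocated (kzalloc) and kfree(NULL) is a no-op. A tiny userspace illustration of the same goto-out idiom (names hypothetical):

    #include <stdlib.h>
    #include <string.h>

    struct ctx {
            int *lists;
            int *locks;
    };

    /* Hypothetical analogue of the ext4_mb_init() error path: every pointer
     * starts out NULL, so one cleanup label can free them all unconditionally
     * (free(NULL), like kfree(NULL), is a no-op). */
    static int ctx_init(struct ctx *c, size_t n)
    {
            memset(c, 0, sizeof(*c));       /* like the kzalloc()ed ext4_sb_info */
            c->lists = calloc(n, sizeof(*c->lists));
            if (!c->lists)
                    goto out;
            c->locks = calloc(n, sizeof(*c->locks));
            if (!c->locks)
                    goto out;
            return 0;
    out:
            free(c->lists);
            free(c->locks);
            memset(c, 0, sizeof(*c));
            return -1;
    }

    int main(void)
    {
            struct ctx c;

            if (ctx_init(&c, 16))
                    return 1;
            free(c.lists);
            free(c.locks);
            return 0;
    }
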
@@ -3578,6 +3539,8 @@ int ext4_mb_release(struct super_block *sb)
                kvfree(group_info);
                rcu_read_unlock();
        }
+       kfree(sbi->s_mb_avg_fragment_size);
+       kfree(sbi->s_mb_avg_fragment_size_locks);
        kfree(sbi->s_mb_largest_free_orders);
        kfree(sbi->s_mb_largest_free_orders_locks);
        kfree(sbi->s_mb_offsets);