Index: linux-2.6.11/fs/ext3/mballoc.c
===================================================================
--- linux-2.6.11.orig/fs/ext3/mballoc.c	2005-03-02 22:42:20.659360368 +0300
+++ linux-2.6.11/fs/ext3/mballoc.c	2005-03-05 19:36:06.000000000 +0300
@@ -0,0 +1,1862 @@
+/*
+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+
+/*
+ * mballoc.c contains the multiblocks allocation routines
+ */
+
+#include <linux/config.h>
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/jbd.h>
+#include <linux/ext3_fs.h>
+#include <linux/ext3_jbd.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+
+/*
+ * TODO:
+ *   - bitmap/buddy read-ahead (proposed by Oleg Drokin aka green)
+ *   - track min/max extents in each group for better group selection
+ *   - is it worthwhile to use buddies directly if req is 2^N blocks?
+ *   - mb_mark_used() may allocate chunk right after splitting buddy
+ *   - special flag to advice allocator to look for requested + N blocks
+ *     this may improve interaction between extents and mballoc
+ *   - tree of groups sorted by number of free blocks
+ *   - percpu reservation code (hotpath)
+ *   - error handling
+ */
+
+/*
+ * with AGGRESSIVE_CHECK allocator runs consistency checks over
+ * structures. these checks slow things down a lot
+ */
+#define AGGRESSIVE_CHECK__
+
+/*
+ * with MBALLOC_STATS allocator will collect stats that will be
+ * shown at umount. The collecting costs though!
+ */
+#define MBALLOC_STATS
+
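+/* with MB_DEBUG mb_debug() prints via printk(), otherwise it is a no-op;
+ * the trailing underscores below keep it disabled */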
+#define MB_DEBUG__
+#ifdef MB_DEBUG
+#define mb_debug(fmt,a...)	printk(fmt, ##a)
+#else
+#define mb_debug(fmt,a...)
+#endif
+
+/*
+ * where to save buddies structures between umount/mount (clean case only)
+ */
+#define EXT3_BUDDY_FILE		".buddy"
+
+/*
+ * How long mballoc can look for a best extent (in found extents)
+ */
+#define EXT3_MB_MAX_TO_SCAN	100
+
+/*
+ * This structure is on-disk description of a group for mballoc
+ */
+struct ext3_mb_group_descr {
+	__u16	mgd_first_free;		/* first free block in the group */
+	__u16	mgd_free;		/* number of free blocks in the group */
+	__u16	mgd_counters[16];	/* number of free blocks by order */
+};
+
+/*
+ * This structure is header of mballoc's file
+ */
+struct ext3_mb_grp_header {
+	__u32	mh_magic;
+};
+
+#define EXT3_MB_MAGIC_V1	0xbabd16fd
+
+
+struct ext3_free_extent {
+	__u16 fe_start;
+	__u16 fe_len;
+	__u16 fe_group;
+};
+
+struct ext3_allocation_context {
+	struct super_block *ac_sb;	/* filesystem to allocate from */
+
+	/* search goals */
+	struct ext3_free_extent ac_g_ex;
+
+	/* the best found extent */
+	struct ext3_free_extent ac_b_ex;
+
+	/* number of iterations done. we have to track to limit searching */
+	unsigned long ac_ex_scanned;
+	__u16 ac_groups_scanned;
+	__u16 ac_found;		/* extents measured so far */
+	__u8 ac_status;		/* one of AC_STATUS_* */
+	__u8 ac_flags;		/* allocation hints */
+	__u8 ac_repeats;
+};
+
+#define AC_STATUS_CONTINUE	1
+#define AC_STATUS_FOUND		2
+#define AC_STATUS_BREAK		3
+
+struct ext3_buddy {
+	struct buffer_head *bd_bh;
+	struct buffer_head *bd_bh2;
+	struct ext3_buddy_group_blocks *bd_bd;
+	struct super_block *bd_sb;
+	__u16 bd_blkbits;
+	__u16 bd_group;
+};
+#define EXT3_MB_BITMAP(e3b)	((e3b)->bd_bh->b_data)
+#define EXT3_MB_BUDDY(e3b)	((e3b)->bd_bh2->b_data)
+
+#define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
+
+int ext3_create (struct inode *, struct dentry *, int, struct nameidata *);
+struct buffer_head * read_block_bitmap(struct super_block *, unsigned int);
+int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *);
+int ext3_mb_reserve_blocks(struct super_block *, int);
+void ext3_mb_release_blocks(struct super_block *, int);
+void ext3_mb_poll_new_transaction(struct super_block *, handle_t *);
+void ext3_mb_free_committed_blocks(struct super_block *);
+
+#if BITS_PER_LONG == 64
+#define mb_correct_addr_and_bit(bit,addr)		\
+do {	/* align addr down to a long, fold byte offset into bit */ \
+	bit += ((unsigned long) addr & 7UL) << 3;	\
+	addr = (void *) ((unsigned long) addr & ~7UL);	\
+} while (0)
+#elif BITS_PER_LONG == 32
+#define mb_correct_addr_and_bit(bit,addr)		\
+do {	/* same as above for 4-byte longs */		\
+	bit += ((unsigned long) addr & 3UL) << 3;	\
+	addr = (void *) ((unsigned long) addr & ~3UL);	\
+} while (0)
+#else
+#error "how many bits you are?!"
+#endif
+
+static inline int mb_test_bit(int bit, void *addr)
+{
+	mb_correct_addr_and_bit(bit,addr);
+	return ext2_test_bit(bit, addr);
+}
+
+static inline void mb_set_bit(int bit, void *addr)
+{
+	mb_correct_addr_and_bit(bit,addr);
+	ext2_set_bit(bit, addr);
+}
+
+static inline void mb_set_bit_atomic(int bit, void *addr)
+{
+	mb_correct_addr_and_bit(bit,addr);
+	ext2_set_bit_atomic(NULL, bit, addr);
+}
+
+static inline void mb_clear_bit(int bit, void *addr)
+{
+	mb_correct_addr_and_bit(bit,addr);
+	ext2_clear_bit(bit, addr);
+}
+
+static inline void mb_clear_bit_atomic(int bit, void *addr)
+{
+	mb_correct_addr_and_bit(bit,addr);
+	ext2_clear_bit_atomic(NULL, bit, addr);
+}
+
+/* return the bitmap of @order for the group and its size in bits via @max */
+static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max)
+{
+	int i = 1;
+	char *bb;
+
+	J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b));
+	J_ASSERT(max != NULL);
+
+	if (order > e3b->bd_blkbits + 1) {
+		*max = 0;
+		return NULL;
+	}
+	/* at order 0 we see each particular block */
+	*max = 1 << (e3b->bd_blkbits + 3);
+	if (order == 0)
+		return EXT3_MB_BITMAP(e3b);
+	/* orders >= 1 are packed one after another in the buddy block */
+	bb = EXT3_MB_BUDDY(e3b);
+	*max = *max >> 1;
+	while (i < order) {
+		bb += 1 << (e3b->bd_blkbits - i);
+		i++;
+		*max = *max >> 1;
+	}
+	J_ASSERT((unsigned) (bb - (char *) EXT3_MB_BUDDY(e3b)) <
+			e3b->bd_sb->s_blocksize);
+	return bb;
+}
+
+/* Get and, if needed, read the bitmap and buddy blocks of @group into @e3b.
+ * Returns 0 with both buffers referenced or -EIO with both set to NULL. */
+static int ext3_mb_load_buddy(struct super_block *sb, int group,
+				struct ext3_buddy *e3b)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+
+	J_ASSERT(sbi->s_buddy_blocks[group]->bb_bitmap);
+	J_ASSERT(sbi->s_buddy_blocks[group]->bb_buddy);
+	e3b->bd_bh2 = NULL;	/* 'out' may run before bd_bh2 is loaded */
+	/* load bitmap */
+	e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_bitmap);
+	if (e3b->bd_bh == NULL) {
+		ext3_error(sb, "ext3_mb_load_buddy",
+				"can't get block for buddy bitmap\n");
+		goto out;
+	}
+	/* load buddy */
+	e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_buddy);
+	if (e3b->bd_bh2 == NULL) {
+		ext3_error(sb, "ext3_mb_load_buddy",
+				"can't get block for buddy\n");
+		goto out;
+	}
+
+	if (!buffer_uptodate(e3b->bd_bh))
+		ll_rw_block(READ, 1, &e3b->bd_bh);
+	if (!buffer_uptodate(e3b->bd_bh2))
+		ll_rw_block(READ, 1, &e3b->bd_bh2);
+	wait_on_buffer(e3b->bd_bh);
+	J_ASSERT(buffer_uptodate(e3b->bd_bh));
+	wait_on_buffer(e3b->bd_bh2);
+	J_ASSERT(buffer_uptodate(e3b->bd_bh2));
+
+	e3b->bd_blkbits = sb->s_blocksize_bits;
+	e3b->bd_bd = sbi->s_buddy_blocks[group];
+	e3b->bd_sb = sb;
+	e3b->bd_group = group;
+
+	return 0;
+out:
+	brelse(e3b->bd_bh);
+	brelse(e3b->bd_bh2);
+	e3b->bd_bh = e3b->bd_bh2 = NULL;
+	return -EIO;
+}
+
+static void ext3_mb_dirty_buddy(struct ext3_buddy *e3b)
+{
+	mark_buffer_dirty(e3b->bd_bh);
+	mark_buffer_dirty(e3b->bd_bh2);
+}
+
+static void ext3_mb_release_desc(struct ext3_buddy *e3b)
+{
+	brelse(e3b->bd_bh);
+	brelse(e3b->bd_bh2);
+}
+
+#ifdef AGGRESSIVE_CHECK
+static void mb_check_buddy(struct ext3_buddy *e3b)
+{
+	int order = e3b->bd_blkbits + 1;
+	int max, max2, i, j, k, count;
+	void *buddy, *buddy2;
+
+	if (!test_opt(e3b->bd_sb, MBALLOC))
+		return;
+
+	while (order > 1) {
+		buddy = mb_find_buddy(e3b, order, &max);
+		J_ASSERT(buddy);
+		buddy2 = mb_find_buddy(e3b, order - 1, &max2);
+		J_ASSERT(buddy2);
+		J_ASSERT(buddy != buddy2);
+		J_ASSERT(max * 2 == max2);
+
+		count = 0;
+		for (i = 0; i < max; i++) {
+
+			if (mb_test_bit(i, buddy)) {
+				/* buddy2 bits can't both be 0 (free) */
+				if (!mb_test_bit(i << 1, buddy2))
+					J_ASSERT(mb_test_bit((i<<1)+1, buddy2));
+				else if (!mb_test_bit((i << 1) + 1, buddy2))
+					J_ASSERT(mb_test_bit(i << 1, buddy2));
+				continue;
+			}
+
+			/* free chunk: both bits in buddy2 must be 1 */
+			J_ASSERT(mb_test_bit(i << 1, buddy2));
+			J_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
+
+			for (j = 0; j < (1 << order); j++) {
+				k = (i * (1 << order)) + j;
+				J_ASSERT(!mb_test_bit(k, EXT3_MB_BITMAP(e3b)));
+			}
+			count++;
+		}
+		J_ASSERT(e3b->bd_bd->bb_counters[order] == count);
+		order--;
+	}
+
+	buddy = mb_find_buddy(e3b, 0, &max);
+	for (i = 0; i < max; i++) {
+		if (!mb_test_bit(i, buddy))
+			continue;
+		/* check used bits only */
+		for (j = 0; j < e3b->bd_blkbits + 1; j++) {
+			buddy2 = mb_find_buddy(e3b, j, &max2);
+			k = i >> j;
+			J_ASSERT(k < max2);
+			J_ASSERT(mb_test_bit(k, buddy2));
+		}
+	}
+}
+#else
+#define mb_check_buddy(e3b)
+#endif
+
+static inline void
+ext3_lock_group(struct super_block *sb, int group)
+{
+	spin_lock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock);
+}
+
+static inline void
+ext3_unlock_group(struct super_block *sb, int group)
+{
+	spin_unlock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock);
+}
+
+/* order (>= 1) of the free buddy chunk holding @block, 0 if there is none */
+static int mb_find_order_for_block(struct ext3_buddy *e3b, int block)
+{
+	int order = 1;
+	void *bb;
+
+	J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b));
+	J_ASSERT(block < (1 << (e3b->bd_blkbits + 3)));
+
+	bb = EXT3_MB_BUDDY(e3b);
+	while (order <= e3b->bd_blkbits + 1) {
+		block = block >> 1;
+		if (!mb_test_bit(block, bb))
+			return order;	/* block is part of buddy of this order */
+		if (order <= e3b->bd_blkbits)	/* no negative shift at the top */
+			bb += 1 << (e3b->bd_blkbits - order);
+		order++;
+	}
+	return 0;
+}
+
+static inline void mb_clear_bits(void *bm, int cur, int len)
+{
+	__u32 *addr;
+
+	len = cur + len;
+	while (cur < len) {
+		if ((cur & 31) == 0 && (len - cur) >= 32) {
+			/* fast path: clear whole word at once */
+			addr = bm + (cur >> 3);
+			*addr = 0;
+			cur += 32;
+			continue;
+		}
+		mb_clear_bit_atomic(cur, bm);
+		cur++;
+	}
+}
+
+static inline void mb_set_bits(void *bm, int cur, int len)
+{
+	__u32 *addr;
+
+	len = cur + len;
+	while (cur < len) {
+		if ((cur & 31) == 0 && (len - cur) >= 32) {
+			/* fast path: set whole word at once */
+			addr = bm + (cur >> 3);
+			*addr = 0xffffffff;
+			cur += 32;
+			continue;
+		}
+		mb_set_bit_atomic(cur, bm);
+		cur++;
+	}
+}
+
+static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count)
+{
+	int block, max, order;
+	void *buddy, *buddy2;
+
+	mb_check_buddy(e3b);
+
+	e3b->bd_bd->bb_free += count;
+	if (first < e3b->bd_bd->bb_first_free)
+		e3b->bd_bd->bb_first_free = first;
+
+	while (count-- > 0) {
+		block = first++;
+		order = 0;
+
+		J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b)));
+		mb_clear_bit(block, EXT3_MB_BITMAP(e3b));
+		e3b->bd_bd->bb_counters[order]++;
+
+		/* start of the buddy */
+		buddy = mb_find_buddy(e3b, order, &max);
+
+		do {
+			block &= ~1UL;
+			if (mb_test_bit(block, buddy) ||
+					mb_test_bit(block + 1, buddy))
+				break;
+
+			/* both the buddies are free, try to coalesce them */
+			buddy2 = mb_find_buddy(e3b, order + 1, &max);
+
+			if (!buddy2)
+				break;
+
+			if (order > 0) {
+				/* for special purposes, we don't set
+				 * free bits in bitmap */
+				mb_set_bit(block, buddy);
+				mb_set_bit(block + 1, buddy);
+			}
+			e3b->bd_bd->bb_counters[order]--;
+			e3b->bd_bd->bb_counters[order]--;
+
+			block = block >> 1;
+			order++;
+			e3b->bd_bd->bb_counters[order]++;
+
+			mb_clear_bit(block, buddy2);
+			buddy = buddy2;
+		} while (1);
+	}
+	mb_check_buddy(e3b);
+
+	return 0;
+}
+
+/* Measure the free extent that begins with chunk @block of @order (0: plain
+ * block number, real order is looked up). Fills @ex and returns its length,
+ * 0 if @block is not free. NOTE(review): @needed is unused here. */
+static int mb_find_extent(struct ext3_buddy *e3b, int order, int block,
+				int needed, struct ext3_free_extent *ex)
+{
+	int next, max, ord;
+	void *buddy;
+
+	J_ASSERT(ex != NULL);
+
+	buddy = mb_find_buddy(e3b, order, &max);
+	J_ASSERT(buddy);
+	J_ASSERT(block < max);
+	if (mb_test_bit(block, buddy)) {
+		ex->fe_len = 0;
+		ex->fe_start = 0;
+		ex->fe_group = 0;
+		return 0;
+	}
+
+	if (order == 0) {
+		/* find actual order */
+		order = mb_find_order_for_block(e3b, block);
+		block = block >> order;
+	}
+
+	ex->fe_len = 1 << order;
+	ex->fe_start = block << order;
+	ex->fe_group = e3b->bd_group;
+
+	while ((buddy = mb_find_buddy(e3b, order, &max))) {
+		if (block + 1 >= max)
+			break;
+
+		next = (block + 1) * (1 << order);
+		if (mb_test_bit(next, EXT3_MB_BITMAP(e3b)))
+			break;
+		ord = mb_find_order_for_block(e3b, next);
+		order = ord;
+		block = next >> order;
+		ex->fe_len += 1 << order;
+	}
+
+	J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3)));
+	return ex->fe_len;
+}
+
+static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex)
+{
+	int start = ex->fe_start;
+	int len = ex->fe_len;
+	int ord, mlen, max, cur;
+	int len0 = len;
+	void *buddy;
+
+	e3b->bd_bd->bb_free -= len;
+	if (e3b->bd_bd->bb_first_free == start)
+		e3b->bd_bd->bb_first_free += len;
+
+	while (len) {
+		ord = mb_find_order_for_block(e3b, start);
+
+		if (((start >> ord) << ord) == start && len >= (1 << ord)) {
+			/* the whole chunk may be allocated at once! */
+			mlen = 1 << ord;
+			buddy = mb_find_buddy(e3b, ord, &max);
+			J_ASSERT((start >> ord) < max);
+			mb_set_bit(start >> ord, buddy);
+			e3b->bd_bd->bb_counters[ord]--;
+			start += mlen;
+			len -= mlen;
+			J_ASSERT(len >= 0);
+			continue;
+		}
+
+		/* we have to split large buddy */
+		J_ASSERT(ord > 0);
+		buddy = mb_find_buddy(e3b, ord, &max);
+		mb_set_bit(start >> ord, buddy);
+		e3b->bd_bd->bb_counters[ord]--;
+
+		ord--;
+		cur = (start >> ord) & ~1U;
+		buddy = mb_find_buddy(e3b, ord, &max);
+		mb_clear_bit(cur, buddy);
+		mb_clear_bit(cur + 1, buddy);
+		e3b->bd_bd->bb_counters[ord]++;
+		e3b->bd_bd->bb_counters[ord]++;
+	}
+
+	/* now drop all the bits in bitmap */
+	mb_set_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0);
+
+	mb_check_buddy(e3b);
+
+	return 0;
+}
+
+/*
+ * Must be called under group lock!
+ */
+static void ext3_mb_use_best_found(struct ext3_allocation_context *ac,
+					struct ext3_buddy *e3b)
+{
+	ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
+	mb_mark_used(e3b, &ac->ac_b_ex);
+	ac->ac_status = AC_STATUS_FOUND;
+}
+
+/*
+ * The routine checks whether found extent is good enough. If it is,
+ * then the extent gets marked used and flag is set to the context
+ * to stop scanning. Otherwise, the extent is compared with the
+ * previous found extent and if new one is better, then it's stored
+ * in the context. Later, the best found extent will be used, if
+ * mballoc can't find good enough extent.
+ *
+ * FIXME: real allocation policy is to be designed yet!
+ */
+static void ext3_mb_measure_extent(struct ext3_allocation_context *ac,
+					struct ext3_free_extent *ex,
+					struct ext3_buddy *e3b)
+{
+	int factor = EXT3_SB(ac->ac_sb)->s_mb_factor;
+	struct ext3_free_extent *bex = &ac->ac_b_ex;
+	int diff = ac->ac_g_ex.fe_len - ex->fe_len;
+
+	J_ASSERT(ex->fe_len > 0);
+	J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8);
+	J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8);
+
+	ac->ac_found++;
+
+	/*
+	 * The special case - take what you catch first
+	 */
+	if (ac->ac_flags & EXT3_MB_HINT_FIRST) {
+		*bex = *ex;
+		ext3_mb_use_best_found(ac, e3b);
+		return;
+	}
+
+	/*
+	 * Let's check whether the chunk is good enough
+	 */
+	if (ex->fe_len >= ac->ac_g_ex.fe_len) {
+		*bex = *ex;
+		ext3_mb_use_best_found(ac, e3b);
+		return;
+	}
+
+	/*
+	 * If the found chunk is very large (> 1000 blocks), it makes sense
+	 * to use it, even if it doesn't satisfy the whole request.
+	 */
+	if (ex->fe_len > 1000) {
+		*bex = *ex;
+		ext3_mb_use_best_found(ac, e3b);
+		return;
+	}
+
+	/*
+	 * Sometimes it's worth taking a close chunk
+	 */
+	if (factor && (ac->ac_g_ex.fe_len * 100) / (diff * 100) >= factor) {
+		*bex = *ex;
+		ext3_mb_use_best_found(ac, e3b);
+		return;
+	}
+
+	/*
+	 * If this is first found extent, just store it in the context
+	 */
+	if (bex->fe_len == 0) {
+		*bex = *ex;
+		return;
+	}
+
+	/*
+	 * If new found extent is better, store it in the context
+	 * FIXME: possible the policy should be more complex?
+	 */
+	if (ex->fe_len > bex->fe_len) {
+		*bex = *ex;
+	}
+
+	/*
+	 * We don't want to scan for a whole year
+	 */
+	if (ac->ac_found > EXT3_MB_MAX_TO_SCAN)
+		ac->ac_status = AC_STATUS_BREAK;
+}
+
+/* Allocate the best extent found so far. It is re-measured under the group
+ * lock first, since blocks of it may have been taken since it was found. */
+static int ext3_mb_try_best_found(struct ext3_allocation_context *ac,
+					struct ext3_buddy *e3b)
+{
+	struct ext3_free_extent ex = ac->ac_b_ex;
+	int group = ex.fe_group, max, err;
+
+	J_ASSERT(ex.fe_len > 0);
+	err = ext3_mb_load_buddy(ac->ac_sb, group, e3b);
+	if (err)
+		return err;
+	ext3_lock_group(ac->ac_sb, group);
+	max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex);
+	if (max > 0) {
+		ac->ac_b_ex = ex;
+		ext3_mb_use_best_found(ac, e3b);
+	}
+	ext3_unlock_group(ac->ac_sb, group);
+
+	if (ac->ac_status == AC_STATUS_FOUND)
+		ext3_mb_dirty_buddy(e3b);
+	ext3_mb_release_desc(e3b);
+	return 0;
+}
+
+static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac,
+				struct ext3_buddy *e3b)
+{
+	int group = ac->ac_g_ex.fe_group, max, err;
+	struct ext3_free_extent ex;
+
+	err = ext3_mb_load_buddy(ac->ac_sb, group, e3b);
+	if (err)
+		return err;
+
+	ext3_lock_group(ac->ac_sb, group);
+	max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start,
+				ac->ac_g_ex.fe_len, &ex);
+	
+	if (max > 0) {
+		J_ASSERT(ex.fe_len > 0);
+		J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group);
+		J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start);
+		ac->ac_b_ex = ex;
+		ext3_mb_use_best_found(ac, e3b);
+	}
+	ext3_unlock_group(ac->ac_sb, group);
+
+	if (ac->ac_status == AC_STATUS_FOUND)
+		ext3_mb_dirty_buddy(e3b);
+	ext3_mb_release_desc(e3b);
+
+	return 0;
+}
+/*
+ * The routine scans the group and measures all found extents.
+ * In order to optimize scanning, it takes the number of free blocks
+ * in the group from bb_free and stops once all of them are seen.
+ */
+static void ext3_mb_scan_group(struct ext3_allocation_context *ac,
+				struct ext3_buddy *e3b)
+{
+	struct super_block *sb = ac->ac_sb;
+	void *bitmap = EXT3_MB_BITMAP(e3b);
+	struct ext3_free_extent ex;
+	int i, free;
+
+	free = e3b->bd_bd->bb_free;
+	J_ASSERT(free > 0);
+
+	i = e3b->bd_bd->bb_first_free;
+	/* stop on BREAK as well, otherwise the scan limit has no effect here */
+	while (free && ac->ac_status == AC_STATUS_CONTINUE) {
+		i = ext2_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i);
+		if (i >= sb->s_blocksize * 8) {
+			J_ASSERT(free == 0);
+			break;
+		}
+
+		mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex);
+		J_ASSERT(ex.fe_len > 0);
+		J_ASSERT(free >= ex.fe_len);
+
+		ext3_mb_measure_extent(ac, &ex, e3b);
+
+		i += ex.fe_len;
+		free -= ex.fe_len;
+	}
+}
+
+/*
+ * Tell whether @group is worth scanning at criteria @cr: any group with
+ * free blocks passes at cr 2, cr 0 and cr 1 also need at least 1/2 and
+ * 1/4 of the requested length free. Returns 1 if good, 0 otherwise.
+ */
+static int ext3_mb_good_group(struct ext3_allocation_context *ac,
+				int group, int cr)
+{
+	int avail = EXT3_SB(ac->ac_sb)->s_buddy_blocks[group]->bb_free;
+
+	J_ASSERT(cr >= 0 && cr < 3);
+	if (avail == 0)
+		return 0;
+	switch (cr) {
+	case 0:
+		return avail >= ac->ac_g_ex.fe_len >> 1;
+	case 1:
+		return avail >= ac->ac_g_ex.fe_len >> 2;
+	default:
+		return cr == 2;
+	}
+}
+
+/*
+ * Allocate up to *len blocks for @inode, preferably starting at @goal.
+ * Returns the first allocated block and stores the number of blocks
+ * really allocated in *len; returns 0 with the error in *errp on failure.
+ * Without EXT3_MB_HINT_RESERVED in @flags only one block may be asked for,
+ * and a reservation for it is taken here and dropped again before return.
+ */
+int ext3_mb_new_blocks(handle_t *handle, struct inode *inode,
+		       unsigned long goal, int *len, int flags, int *errp)
+{
+	struct buffer_head *bitmap_bh = NULL;
+	struct ext3_allocation_context ac;
+	int i, group, block, cr, err = 0;
+	struct ext3_group_desc *gdp;
+	struct ext3_super_block *es;
+	struct buffer_head *gdp_bh;
+	struct ext3_sb_info *sbi;
+	struct super_block *sb;
+	struct ext3_buddy e3b;
+
+	J_ASSERT(len != NULL);
+	J_ASSERT(*len > 0);
+
+	sb = inode->i_sb;
+	if (!sb) {
+		printk("ext3_mb_new_nblocks: nonexistent device");
+		*errp = -ENODEV;
+		return 0;
+	}
+
+	if (!test_opt(sb, MBALLOC)) {
+		static int ext3_mballoc_warning = 0;
+		if (ext3_mballoc_warning == 0) {
+			printk(KERN_ERR "EXT3-fs: multiblock request with "
+				"mballoc disabled!\n");
+			ext3_mballoc_warning++;
+		}
+		*len = 1;
+		err = ext3_new_block_old(handle, inode, goal, errp);
+		return err;
+	}
+
+	ext3_mb_poll_new_transaction(sb, handle);
+
+	sbi = EXT3_SB(sb);
+	es = EXT3_SB(sb)->s_es;
+
+	/*
+	 * We can't allocate > group size
+	 */
+	if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10)
+		*len = EXT3_BLOCKS_PER_GROUP(sb) - 10;
+
+	if (!(flags & EXT3_MB_HINT_RESERVED)) {
+		/* someone asks for non-reserved blocks */
+		BUG_ON(*len > 1);
+		err = ext3_mb_reserve_blocks(sb, 1);
+		if (err) {
+			*errp = err;
+			return 0;
+		}
+	}
+
+	/*
+	 * Check quota for allocation of this blocks.
+	 */
+	while (*len && DQUOT_ALLOC_BLOCK(inode, *len))
+		*len -= 1;
+	if (*len == 0) {
+		*errp = -EDQUOT;
+		ac.ac_g_ex.fe_len = 0;	/* read at 'out' for the stats */
+		block = 0;
+		goto out;
+	}
+
+	/* start searching from the goal */
+	if (goal < le32_to_cpu(es->s_first_data_block) ||
+	    goal >= le32_to_cpu(es->s_blocks_count))
+		goal = le32_to_cpu(es->s_first_data_block);
+	group = (goal - le32_to_cpu(es->s_first_data_block)) /
+			EXT3_BLOCKS_PER_GROUP(sb);
+	block = ((goal - le32_to_cpu(es->s_first_data_block)) %
+			EXT3_BLOCKS_PER_GROUP(sb));
+
+	/* set up allocation goals */
+	ac.ac_b_ex.fe_group = 0;
+	ac.ac_b_ex.fe_start = 0;
+	ac.ac_b_ex.fe_len = 0;
+	ac.ac_status = AC_STATUS_CONTINUE;
+	ac.ac_groups_scanned = 0;
+	ac.ac_ex_scanned = 0;
+	ac.ac_found = 0;
+	ac.ac_sb = inode->i_sb;
+	ac.ac_g_ex.fe_group = group;
+	ac.ac_g_ex.fe_start = block;
+	ac.ac_g_ex.fe_len = *len;
+	ac.ac_flags = flags;
+
+	/*
+	 * Sometimes, caller may want to merge even small number
+	 * of blocks to an existing extent
+	 */
+	if (ac.ac_flags & EXT3_MB_HINT_MERGE) {
+		err = ext3_mb_find_by_goal(&ac, &e3b);
+		if (err)
+			goto out_err;
+		if (ac.ac_status == AC_STATUS_FOUND)
+			goto found;
+	}
+
+	/*
+	 * FIXME
+	 * If requested chunk is power of 2 length, we can try
+	 * to exploit buddy nature to speed allocation up
+	 */
+
+	/*
+	 * Let's just scan groups to find more-less suitable blocks
+	 */
+	cr = 0;
+repeat:
+	for (; cr < 3 && ac.ac_status == AC_STATUS_CONTINUE; cr++) {
+		for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) {
+			if (group == EXT3_SB(sb)->s_groups_count)
+				group = 0;
+
+			/* check is group good for our criteries */
+			if (!ext3_mb_good_group(&ac, group, cr))
+				continue;
+
+			err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b);
+			if (err)
+				goto out_err;
+
+			ext3_lock_group(sb, group);
+			if (!ext3_mb_good_group(&ac, group, cr)) {
+				/* someone did allocation from this group */
+				ext3_unlock_group(sb, group);
+				ext3_mb_release_desc(&e3b);
+				continue;
+			}
+
+			ext3_mb_scan_group(&ac, &e3b);
+			ext3_unlock_group(sb, group);
+
+			if (ac.ac_status == AC_STATUS_FOUND)
+				ext3_mb_dirty_buddy(&e3b);
+			ext3_mb_release_desc(&e3b);
+
+			if (ac.ac_status != AC_STATUS_CONTINUE)
+				break;
+		}
+	}
+
+	if (ac.ac_status == AC_STATUS_BREAK &&
+			!(ac.ac_flags & EXT3_MB_HINT_FIRST)) {
+		/*
+		 * We've been searching too long. Let's try to allocate
+		 * the best chunk we've found so far
+		 */
+		printk(KERN_ERR "EXT3-fs: too long searching (%d/%d)\n",
+				ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len);
+		ext3_mb_try_best_found(&ac, &e3b);
+		if (ac.ac_status != AC_STATUS_FOUND) {
+			/*
+			 * Someone more lucky has already allocated it.
+			 * The only thing we can do is just take first
+			 * found block(s)
+			 */
+			printk(KERN_ERR "EXT3-fs: and someone won our chunk\n");
+			ac.ac_b_ex.fe_group = 0;
+			ac.ac_b_ex.fe_start = 0;
+			ac.ac_b_ex.fe_len = 0;
+			ac.ac_status = AC_STATUS_CONTINUE;
+			ac.ac_flags |= EXT3_MB_HINT_FIRST;
+			cr = 2;
+			goto repeat;
+		}
+	}
+
+	if (ac.ac_status != AC_STATUS_FOUND) {
+		/*
+		 * We aren't lucky definitely
+		 */
+		DQUOT_FREE_BLOCK(inode, *len);
+		*errp = -ENOSPC;
+		block = 0;
+#if 1
+		printk(KERN_ERR "EXT3-fs: cant allocate: status %d, flags %d\n",
+			ac.ac_status, ac.ac_flags);
+		printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d, cr %d\n",
+			ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group,
+			ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr);
+		printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n",
+			sbi->s_blocks_reserved, ac.ac_found);
+		printk("EXT3-fs: groups: ");
+		for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
+			printk("%d: %d ", i,
+				sbi->s_buddy_blocks[i]->bb_free);
+		printk("\n");
+#endif
+		goto out;
+	}
+
+found:
+	J_ASSERT(ac.ac_b_ex.fe_len > 0);
+
+	/* we made a decision, now mark found blocks in good old
+	 * bitmap to be journaled */
+
+	ext3_debug("using block group %d(%d)\n", ac.ac_b_ex.fe_group,
+			sbi->s_buddy_blocks[ac.ac_b_ex.fe_group]->bb_free);
+
+	bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group);
+	if (!bitmap_bh) {
+		err = -EIO;
+		goto out_err;
+	}
+
+	err = ext3_journal_get_write_access(handle, bitmap_bh);
+	if (err) {
+		*errp = err;
+		goto out_err;
+	}
+
+	gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh);
+	if (!gdp) {
+		err = -EIO;
+		goto out_err;
+	}
+
+	err = ext3_journal_get_write_access(handle, gdp_bh);
+	if (err)
+		goto out_err;
+
+	block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb)
+			+ ac.ac_b_ex.fe_start
+			+ le32_to_cpu(es->s_first_data_block);
+
+	if (block == le32_to_cpu(gdp->bg_block_bitmap) ||
+	    block == le32_to_cpu(gdp->bg_inode_bitmap) ||
+	    in_range(block, le32_to_cpu(gdp->bg_inode_table),
+		      EXT3_SB(sb)->s_itb_per_group))
+		ext3_error(sb, "ext3_new_block",
+			    "Allocating block in system zone - "
+			    "block = %u", block);
+#ifdef AGGRESSIVE_CHECK
+	for (i = 0; i < ac.ac_b_ex.fe_len; i++)
+		J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data));
+#endif
+	mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len);
+
+	spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group));
+	gdp->bg_free_blocks_count =
+			cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)
+					- ac.ac_b_ex.fe_len);
+	spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group));
+	percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len);
+
+	err = ext3_journal_dirty_metadata(handle, bitmap_bh);
+	if (err)
+		goto out_err;
+	err = ext3_journal_dirty_metadata(handle, gdp_bh);
+	if (err)
+		goto out_err;
+
+	sb->s_dirt = 1;
+	*errp = 0;
+	brelse(bitmap_bh);
+
+	/* drop non-allocated, but dquote'd blocks */
+	J_ASSERT(*len >= ac.ac_b_ex.fe_len);
+	DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len);
+
+	*len = ac.ac_b_ex.fe_len;
+	J_ASSERT(*len > 0);
+	J_ASSERT(block != 0);
+	goto out;
+
+out_err:
+	/* if we've already allocated something, roll it back */
+	if (ac.ac_status == AC_STATUS_FOUND) {
+		/* FIXME: free blocks here */
+	}
+
+	DQUOT_FREE_BLOCK(inode, *len);
+	brelse(bitmap_bh);
+	*errp = err;
+	block = 0;
+out:
+	if (!(flags & EXT3_MB_HINT_RESERVED)) {
+		/* block wasn't reserved before and we reserved it
+		 * at the beginning of allocation. it doesn't matter
+		 * whether we allocated anything or we failed: time
+		 * to release reservation. NOTE: because I expect
+		 * any multiblock request from delayed allocation
+		 * path only, here is single block always */
+		ext3_mb_release_blocks(sb, 1);
+	}
+#ifdef MBALLOC_STATS
+	if (ac.ac_g_ex.fe_len > 1) {
+		spin_lock(&sbi->s_bal_lock);
+		sbi->s_bal_reqs++;
+		sbi->s_bal_allocated += *len;
+		if (*len >= ac.ac_g_ex.fe_len)
+			sbi->s_bal_success++;
+		sbi->s_bal_ex_scanned += ac.ac_found;
+		if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start &&
+				ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group)
+			sbi->s_bal_goals++;
+		if (ac.ac_found > EXT3_MB_MAX_TO_SCAN)
+			sbi->s_bal_breaks++;
+		spin_unlock(&sbi->s_bal_lock);
+	}
+#endif
+	return block;
+}
+
+/* Read the block of the buddy file that holds the mballoc descriptor of
+ * e3b's group; on success (0) *bh is that block and *grp the descriptor. */
+int ext3_mb_get_descr_loc(struct ext3_buddy *e3b, struct buffer_head **bh,
+				struct ext3_mb_group_descr **grp)
+{
+	struct super_block *sb = e3b->bd_sb;
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	int descr_per_block, err, offset;
+	struct ext3_mb_grp_header *hdr;
+	unsigned long block;
+
+	descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header))
+				/ sizeof(struct ext3_mb_group_descr);
+	block = e3b->bd_group / descr_per_block;
+	*bh = ext3_bread(NULL, sbi->s_buddy, block, 0, &err);
+	if (*bh == NULL) {
+		printk(KERN_ERR "EXT3-fs: cant getblk descr for group %d: %d\n",
+				e3b->bd_group, err);
+		return err ? err : -EIO;	/* never succeed without a buffer */
+	}
+	hdr = (struct ext3_mb_grp_header *) (*bh)->b_data;
+	if (hdr->mh_magic != EXT3_MB_MAGIC_V1) {
+		printk(KERN_ERR "EXT3-fs: invalid magic in group %d!\n",
+				e3b->bd_group);
+		brelse(*bh);
+		*bh = NULL;
+		return -EIO;
+	}
+
+	offset = e3b->bd_group % descr_per_block
+			* sizeof(struct ext3_mb_group_descr)
+			+ sizeof(struct ext3_mb_grp_header);
+	*grp = (struct ext3_mb_group_descr *) ((*bh)->b_data + offset);
+	return 0;
+}
+
+int ext3_mb_load_descr(struct ext3_buddy *e3b)
+{
+	struct ext3_mb_group_descr *grp;
+	struct ext3_group_desc *gdp;
+	struct buffer_head *bh;
+	int err, i;
+
+	err = ext3_mb_get_descr_loc(e3b, &bh, &grp);
+	if (err)
+		return err;
+	
+	e3b->bd_bd->bb_first_free = grp->mgd_first_free;
+	e3b->bd_bd->bb_free = grp->mgd_free;
+	for (i = 0; i <= e3b->bd_blkbits + 1; i++) {
+		J_ASSERT(i < 16);
+		e3b->bd_bd->bb_counters[i] = grp->mgd_counters[i];
+	}
+	brelse(bh);
+
+	/* additional checks against old group descriptor */
+	gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL);
+	if (!gdp)
+		return -EIO;
+	if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) {
+		printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n",
+			e3b->bd_group, e3b->bd_bd->bb_free,
+			le16_to_cpu(gdp->bg_free_blocks_count));
+		return -ENODATA;
+	}
+
+	return 0;
+}
+
+
+int ext3_mb_update_descr(struct ext3_buddy *e3b)
+{
+	struct ext3_mb_group_descr *grp;
+	struct ext3_group_desc *gdp;
+	struct buffer_head *bh;
+	handle_t *handle;
+	int err, i;
+
+	/* additional checks against old group descriptor */
+	gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL);
+	if (!gdp)
+		return -EIO;
+	if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) {
+		printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n",
+			e3b->bd_group, e3b->bd_bd->bb_free,
+			le16_to_cpu(gdp->bg_free_blocks_count));
+		return -ENODATA;
+	}
+
+	err = ext3_mb_get_descr_loc(e3b, &bh, &grp);
+	if (err)
+		return err;
+	
+	handle = ext3_journal_start_sb(e3b->bd_sb, 1);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		handle = NULL;
+		goto out;
+	}
+
+	err = ext3_journal_get_write_access(handle, bh);
+	if (err)
+		goto out;
+	grp->mgd_first_free = e3b->bd_bd->bb_first_free;
+	grp->mgd_free = e3b->bd_bd->bb_free;
+	for (i = 0; i <= e3b->bd_blkbits + 1; i++) {
+		J_ASSERT(i < 16);
+		grp->mgd_counters[i] = e3b->bd_bd->bb_counters[i];
+	}
+	err = ext3_journal_dirty_metadata(handle, bh);
+	if (err)
+		goto out;
+	err = 0;
+out:
+	brelse(bh);
+	if (handle)
+		ext3_journal_stop(handle);
+	return err;
+}
+
+/* Rebuild the mballoc bitmap, buddy and counters of e3b's group from the
+ * group's block bitmap. Returns 0, or -EIO if that bitmap can't be read. */
+int ext3_mb_generate_buddy(struct ext3_buddy *e3b)
+{
+	struct super_block *sb = e3b->bd_sb;
+	struct buffer_head *bh;
+	int i;
+
+	mb_debug("generate buddy for group %d\n", e3b->bd_group);
+	memset(e3b->bd_bh->b_data, 0xff, sb->s_blocksize);
+	memset(e3b->bd_bh2->b_data, 0xff, sb->s_blocksize);
+
+	bh = read_block_bitmap(sb, e3b->bd_group);
+	if (bh == NULL)
+		return -EIO;
+
+	/* mb_free_blocks will set real free */
+	e3b->bd_bd->bb_free = 0;
+	e3b->bd_bd->bb_first_free = 1 << 15;
+	/*
+	 * if change bb_counters size, don't forget about
+	 * ext3_mb_init_backend() -bzzz
+	 */
+	memset(e3b->bd_bd->bb_counters, 0,
+		sizeof(unsigned) * (sb->s_blocksize_bits + 2));
+
+	/* loop over the blocks, and create buddies for free ones */
+	for (i = 0; i < sb->s_blocksize * 8; i++) {
+		if (!mb_test_bit(i, (void *) bh->b_data))
+			mb_free_blocks(e3b, i, 1);
+	}
+	brelse(bh);
+	mb_check_buddy(e3b);
+	ext3_mb_dirty_buddy(e3b);
+
+	return 0;
+}
+
+EXPORT_SYMBOL(ext3_mb_new_blocks);
+/* journal credits for each handle started in ext3_mb_init_backend() */
+#define MB_CREDITS	\
+	(EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS +	\
+		2 * EXT3_SINGLEDATA_TRANS_BLOCKS)
+
+/*
+ * Open (or create) the buddy file in the root directory, make sure its
+ * descriptor, bitmap and buddy blocks exist and set up the in-core per-group
+ * state.  *created is set when the buddy data has to be regenerated.
+ * Returns 0 or -errno; dentry, handle and buffer are dropped on all exits.
+ */
+int ext3_mb_init_backend(struct super_block *sb, int *created)
+{
+	int err = 0, i, len, descr_per_block, buddy_offset, size;
+	struct inode *root = sb->s_root->d_inode;
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	struct ext3_mb_grp_header *hdr;
+	struct buffer_head *bh = NULL;
+	unsigned long block;
+	struct dentry *db;
+	handle_t *handle;
+	tid_t target;
+
+	*created = 0;
+	len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count;
+	sbi->s_buddy_blocks = kmalloc(len, GFP_KERNEL);
+	if (sbi->s_buddy_blocks == NULL) {
+		printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n");
+		return -ENOMEM;
+	}
+	memset(sbi->s_buddy_blocks, 0, len);
+	sbi->s_buddy = NULL;
+
+	down(&root->i_sem);
+	len = strlen(EXT3_BUDDY_FILE);
+	db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, len);
+	if (IS_ERR(db)) {
+		err = PTR_ERR(db);
+		printk(KERN_ERR "EXT3-fs: cant lookup buddy: %d\n", err);
+		up(&root->i_sem);
+		goto out;
+	}
+
+	if (db->d_inode == NULL) {
+		err = ext3_create(root, db, S_IFREG, NULL);
+		if (err) {
+			printk(KERN_ERR "EXT3-fs: cant create buddy: %d\n", err);
+			up(&root->i_sem);
+			goto out_dput;
+		}
+		db->d_inode->i_flags |= S_IMMUTABLE | S_NOATIME;
+		*created = 1;
+		mb_debug("no buddy file, regenerate\n");
+	}
+	up(&root->i_sem);
+	sbi->s_buddy = igrab(db->d_inode);
+
+	/* calculate needed size */
+	descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header))
+				/ sizeof(struct ext3_mb_group_descr);
+	buddy_offset = (sbi->s_groups_count + descr_per_block - 1)
+				 / descr_per_block;
+	len = sbi->s_groups_count * sb->s_blocksize * 2 +
+			buddy_offset * sb->s_blocksize;
+	if (len != i_size_read(sbi->s_buddy)) {
+		if (*created == 0)
+			printk("EXT3-fs: wrong i_size (%u != %u), regenerate\n",
+				(unsigned) len,
+				(unsigned) i_size_read(sbi->s_buddy));
+		*created = 1;
+	}
+
+	/* read/create mb group descriptors */
+	for (i = 0; i < buddy_offset; i++) {
+		handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS);
+		if (IS_ERR(handle)) {
+			printk(KERN_ERR "EXT3-fs: cant start transaction\n");
+			err = PTR_ERR(handle);
+			goto out_dput;
+		}
+		bh = ext3_bread(handle, sbi->s_buddy, i, 1, &err);
+		if (bh == NULL) {
+			printk(KERN_ERR "EXT3-fs: cant getblk grp: %d\n", err);
+			goto out_stop;
+		}
+		hdr = (struct ext3_mb_grp_header *) bh->b_data;
+		if (hdr->mh_magic != EXT3_MB_MAGIC_V1) {
+			err = ext3_journal_get_write_access(handle, bh);
+			if (err)
+				goto out_stop;
+			if (*created == 0)
+				printk(KERN_ERR
+					"EXT3-fs: invalid header 0x%x in %d,"
+					"regenerate\n", hdr->mh_magic, i);
+			*created = 1;
+			hdr->mh_magic = EXT3_MB_MAGIC_V1;
+			err = ext3_journal_dirty_metadata(handle, bh);
+			if (err)
+				goto out_stop;
+		}
+		brelse(bh);
+		ext3_journal_stop(handle);
+	}
+
+	/* bb_counters size must match ext3_mb_generate_buddy() */
+	len = sizeof(struct ext3_buddy_group_blocks);
+	len += sizeof(unsigned) * (sb->s_blocksize_bits + 2);
+	for (i = 0; i < sbi->s_groups_count; i++) {
+		sbi->s_buddy_blocks[i] = kmalloc(len, GFP_KERNEL);
+		if (sbi->s_buddy_blocks[i] == NULL) {
+			printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n");
+			err = -ENOMEM;
+			goto out_dput;
+		}
+		memset(sbi->s_buddy_blocks[i], 0, len);
+
+		handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS);
+		if (IS_ERR(handle)) {
+			printk(KERN_ERR "EXT3-fs: cant start transaction\n");
+			err = PTR_ERR(handle);
+			goto out_dput;
+		}
+		/* allocate block for bitmap */
+		block = buddy_offset + i * 2;
+		bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err);
+		if (bh == NULL) {
+			printk(KERN_ERR "EXT3-fs: cant getblk bitmap: %d\n", err);
+			goto out_stop;
+		}
+		sbi->s_buddy_blocks[i]->bb_bitmap = bh->b_blocknr;
+		brelse(bh);
+
+		/* allocate block for buddy */
+		block = buddy_offset + i * 2 + 1;
+		bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err);
+		if (bh == NULL) {
+			printk(KERN_ERR "EXT3-fs: cant getblk for buddy: %d\n", err);
+			goto out_stop;
+		}
+		sbi->s_buddy_blocks[i]->bb_buddy = bh->b_blocknr;
+		brelse(bh);
+
+		size = (block + 1) << sbi->s_buddy->i_blkbits;
+		if (size > sbi->s_buddy->i_size) {
+			*created = 1;
+			EXT3_I(sbi->s_buddy)->i_disksize = size;
+			i_size_write(sbi->s_buddy, size);
+			mark_inode_dirty(sbi->s_buddy);
+		}
+		ext3_journal_stop(handle);
+		spin_lock_init(&sbi->s_buddy_blocks[i]->bb_lock);
+		sbi->s_buddy_blocks[i]->bb_md_cur = NULL;
+		sbi->s_buddy_blocks[i]->bb_tid = 0;
+	}
+
+	if (journal_start_commit(sbi->s_journal, &target))
+		log_wait_commit(sbi->s_journal, target);
+	goto out_dput;
+out_stop:		/* handle is running; bh is NULL or still referenced */
+	brelse(bh);
+	ext3_journal_stop(handle);
+out_dput:
+	dput(db);
+out:
+	return err;
+}
+
+int ext3_mb_write_descriptors(struct super_block *sb)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	struct ext3_buddy e3b;
+	int ret = 0, i, err;
+
+	for (i = 0; i < sbi->s_groups_count; i++) {
+		if (sbi->s_buddy_blocks[i] == NULL)
+			continue;
+
+		err = ext3_mb_load_buddy(sb, i, &e3b);
+		if (err == 0) {
+			ext3_mb_update_descr(&e3b);
+			ext3_mb_release_desc(&e3b);
+		} else
+			ret = err;
+	}
+	return ret;
+}
+
+/*
+ * Unmount-time teardown (no-op without mballoc): free all queued blocks,
+ * write the group descriptors out and drop the in-core state.  Returns 0.
+ */
+int ext3_mb_release(struct super_block *sb)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	int grp;
+
+	if (!test_opt(sb, MBALLOC))
+		return 0;
+
+	/* blocks freed by not yet committed transactions go back as well */
+	spin_lock(&sbi->s_md_lock);
+	list_splice_init(&sbi->s_closed_transaction,
+			&sbi->s_committed_transaction);
+	list_splice_init(&sbi->s_active_transaction,
+			&sbi->s_committed_transaction);
+	spin_unlock(&sbi->s_md_lock);
+	ext3_mb_free_committed_blocks(sb);
+
+	if (sbi->s_buddy_blocks != NULL) {
+		ext3_mb_write_descriptors(sb);
+		for (grp = 0; grp < sbi->s_groups_count; grp++)
+			kfree(sbi->s_buddy_blocks[grp]);  /* NULL is fine */
+		kfree(sbi->s_buddy_blocks);
+	}
+	iput(sbi->s_buddy);		/* NULL is fine here, too */
+	if (sbi->s_blocks_reserved != 0)
+		printk("ext3-fs: %ld blocks being reserved at umount!\n",
+				sbi->s_blocks_reserved);
+#ifdef MBALLOC_STATS
+	printk("EXT3-fs: mballoc: %lu blocks %lu reqs (%lu success)\n",
+		sbi->s_bal_allocated, sbi->s_bal_reqs, sbi->s_bal_success);
+	printk("EXT3-fs: mballoc: %lu extents scanned, %lu goal hits, %lu breaks\n",
+		sbi->s_bal_ex_scanned, sbi->s_bal_goals, sbi->s_bal_breaks);
+#endif
+	return 0;
+}
+
+/* Mount-time setup of the buddy allocator; returns 0 or a negative errno. */
+int ext3_mb_init(struct super_block *sb, int needs_recovery)
+{
+	struct ext3_buddy e3b;
+	int i, err, created;
+
+	if (!test_opt(sb, MBALLOC))
+		return 0;
+
+	/* keep MBALLOC off until the buddy data is known to be usable */
+	clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC);
+	if ((err = ext3_mb_init_backend(sb, &created)))
+		return err;
+repeat:
+	for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
+		err = ext3_mb_load_buddy(sb, i, &e3b);
+		if (err)
+			return err;	/* FIXME: release backend */
+		if (created || needs_recovery)
+			err = ext3_mb_generate_buddy(&e3b);
+		else
+			err = ext3_mb_load_descr(&e3b);
+		ext3_mb_release_desc(&e3b);
+		if (err == -ENODATA) {
+			/* stored summary is unusable: regenerate every group */
+			created = 1;
+			goto repeat;
+		}
+		if (err)
+			return err;	/* FIXME: release backend */
+	}
+	if (created || needs_recovery)
+		printk(KERN_ERR "EXT3-fs: generated buddies for %lu groups\n",
+				EXT3_SB(sb)->s_groups_count);
+	spin_lock_init(&EXT3_SB(sb)->s_reserve_lock);
+	spin_lock_init(&EXT3_SB(sb)->s_md_lock);
+	INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction);
+	INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction);
+	INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction);
+	set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC);
+#ifdef MBALLOC_STATS
+	spin_lock_init(&EXT3_SB(sb)->s_bal_lock);
+#define	MBALLOC_INFO	" (stats)"
+#else
+#define	MBALLOC_INFO	""
+#endif
+	printk("EXT3-fs: mballoc enabled%s\n", MBALLOC_INFO);
+	return 0;
+}
+
+/*
+ * Put every block queued on s_committed_transaction back into the buddy.
+ * Entries are unlinked one by one under s_md_lock and processed outside it.
+ */
+void ext3_mb_free_committed_blocks(struct super_block *sb)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	int err, n, nblocks = 0, nentries = 0;
+	struct ext3_free_metadata *entry;
+	struct ext3_buddy e3b;
+
+	if (list_empty(&sbi->s_committed_transaction))
+		return;
+
+	for (;;) {
+		/* detach the next container, if there is one left */
+		entry = NULL;
+		spin_lock(&sbi->s_md_lock);
+		if (!list_empty(&sbi->s_committed_transaction)) {
+			entry = list_entry(sbi->s_committed_transaction.next,
+					struct ext3_free_metadata, list);
+			list_del(&entry->list);
+		}
+		spin_unlock(&sbi->s_md_lock);
+		if (entry == NULL)
+			break;
+
+		mb_debug("gonna free %u blocks in group %u (0x%p):",
+				entry->num, entry->group, entry);
+		err = ext3_mb_load_buddy(sb, entry->group, &e3b);
+		BUG_ON(err != 0);
+
+		/* handing the blocks to the buddy makes them really free */
+		nblocks += entry->num;
+		nentries++;
+		ext3_lock_group(sb, entry->group);
+		for (n = 0; n < entry->num; n++) {
+			mb_debug(" %u", entry->blocks[n]);
+			mb_free_blocks(&e3b, entry->blocks[n], 1);
+		}
+		mb_debug("\n");
+		ext3_unlock_group(sb, entry->group);
+
+		kfree(entry);
+		ext3_mb_dirty_buddy(&e3b);
+		ext3_mb_release_desc(&e3b);
+	}
+	mb_debug("freed %u blocks in %u structures\n", nblocks, nentries);
+}
+
+/*
+ * Notice when @handle belongs to a transaction newer than the last one seen
+ * and rotate the free-metadata lists: "closed" entries move to "committed",
+ * "active" ones to "closed".  Only one transaction runs at a time, so the
+ * one before the previous is already logged and its blocks may be reused.
+ */
+void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	tid_t tid = handle->h_transaction->t_tid;
+
+	if (sbi->s_last_transaction == tid)
+		return;
+
+	spin_lock(&sbi->s_md_lock);
+	/* check again now that the lock is held, then rotate the lists */
+	if (sbi->s_last_transaction != tid) {
+		mb_debug("new transaction %lu, old %lu\n", (unsigned long) tid,
+				(unsigned long) sbi->s_last_transaction);
+		list_splice_init(&sbi->s_closed_transaction,
+					&sbi->s_committed_transaction);
+		list_splice_init(&sbi->s_active_transaction,
+					&sbi->s_closed_transaction);
+		sbi->s_last_transaction = tid;
+	}
+	spin_unlock(&sbi->s_md_lock);
+
+	ext3_mb_free_committed_blocks(sb);
+}
+
+/*
+ * Queue @count blocks from group-relative bit @block on the group's current
+ * free-metadata container (allocated on demand).  Returns 0 or -ENOMEM.
+ */
+int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b,
+				int group, int block, int count)
+{
+	struct ext3_buddy_group_blocks *db = e3b->bd_bd;
+	struct super_block *sb = e3b->bd_sb;
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	struct ext3_free_metadata *md;
+	int i;
+
+	ext3_lock_group(sb, group);
+	for (i = 0; i < count; i++) {
+		md = db->bb_md_cur;
+		if (md && db->bb_tid != handle->h_transaction->t_tid) {
+			db->bb_md_cur = NULL;
+			md = NULL;
+		}
+		if (md == NULL) {
+			ext3_unlock_group(sb, group);
+			/* under a journal handle: must not recurse into fs */
+			md = kmalloc(sizeof(*md), GFP_NOFS);
+			if (md == NULL)
+				return -ENOMEM;
+			md->num = 0;
+			md->group = group;
+			ext3_lock_group(sb, group);
+			if (db->bb_md_cur == NULL) {
+				spin_lock(&sbi->s_md_lock);
+				list_add(&md->list, &sbi->s_active_transaction);
+				spin_unlock(&sbi->s_md_lock);
+				db->bb_md_cur = md;
+				db->bb_tid = handle->h_transaction->t_tid;
+				mb_debug("new md 0x%p for group %u\n",
+							md, md->group);
+			} else {	/* one appeared while the lock was dropped */
+				kfree(md);
+				md = db->bb_md_cur;
+			}
+		}
+		BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS);
+		md->blocks[md->num] = block + i;
+		md->num++;
+		if (md->num == EXT3_BB_MAX_BLOCKS)
+			db->bb_md_cur = NULL;	/* full: start a new one */
+	}
+	ext3_unlock_group(sb, group);
+	return 0;
+}
+
+/*
+ * Free @count blocks starting at filesystem block @block, one block group
+ * at a time when the range crosses group boundaries.  With @metadata set
+ * the blocks are queued until their transaction commits instead of going
+ * straight back to the buddy.  *freed receives the total number released.
+ */
+void ext3_mb_free_blocks(handle_t *handle, struct inode *inode,
+			unsigned long block, unsigned long count,
+			int metadata, int *freed)
+{
+	struct buffer_head *bitmap_bh = NULL;
+	struct ext3_group_desc *gdp;
+	struct ext3_super_block *es;
+	unsigned long bit, overflow;
+	struct buffer_head *gd_bh;
+	unsigned long block_group;
+	struct ext3_sb_info *sbi;
+	struct super_block *sb;
+	struct ext3_buddy e3b;
+	int err = 0, ret;
+
+	*freed = 0;
+	sb = inode->i_sb;
+	if (!sb) {
+		printk ("ext3_free_blocks: nonexistent device");
+		return;
+	}
+
+	ext3_mb_poll_new_transaction(sb, handle);
+
+	sbi = EXT3_SB(sb);
+	es = EXT3_SB(sb)->s_es;
+	if (block < le32_to_cpu(es->s_first_data_block) ||
+	    block + count < block ||
+	    block + count > le32_to_cpu(es->s_blocks_count)) {
+		ext3_error (sb, "ext3_free_blocks",
+			    "Freeing blocks not in datazone - "
+			    "block = %lu, count = %lu", block, count);
+		goto error_return;
+	}
+
+	ext3_debug("freeing block %lu\n", block);
+
+do_more:
+	overflow = 0;
+	block_group = (block - le32_to_cpu(es->s_first_data_block)) /
+		      EXT3_BLOCKS_PER_GROUP(sb);
+	bit = (block - le32_to_cpu(es->s_first_data_block)) %
+		      EXT3_BLOCKS_PER_GROUP(sb);
+	/* clip at the group boundary; the rest is freed on the next pass */
+	if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
+		overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
+		count -= overflow;
+	}
+	brelse(bitmap_bh);
+	bitmap_bh = read_block_bitmap(sb, block_group);
+	if (!bitmap_bh)
+		goto error_return;
+	gdp = ext3_get_group_desc (sb, block_group, &gd_bh);
+	if (!gdp)
+		goto error_return;
+
+	if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) ||
+	    in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) ||
+	    in_range (block, le32_to_cpu(gdp->bg_inode_table),
+		      EXT3_SB(sb)->s_itb_per_group) ||
+	    in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table),
+		      EXT3_SB(sb)->s_itb_per_group))
+		ext3_error (sb, "ext3_free_blocks",
+			    "Freeing blocks in system zones - "
+			    "Block = %lu, count = %lu",
+			    block, count);
+
+	BUFFER_TRACE(bitmap_bh, "getting write access");
+	err = ext3_journal_get_write_access(handle, bitmap_bh);
+	if (err)
+		goto error_return;
+
+	/* we are about to modify metadata: the journal must unshare ->b_data
+	 * if a currently-committing transaction is using it */
+	BUFFER_TRACE(gd_bh, "get_write_access");
+	err = ext3_journal_get_write_access(handle, gd_bh);
+	if (err)
+		goto error_return;
+
+	err = ext3_mb_load_buddy(sb, block_group, &e3b);
+	if (err)
+		goto error_return;
+
+#ifdef AGGRESSIVE_CHECK
+	{
+		int i;
+		for (i = 0; i < count; i++)
+			J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data));
+	}
+#endif
+	mb_clear_bits(bitmap_bh->b_data, bit, count);
+
+	/* We dirtied the bitmap block */
+	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+	err = ext3_journal_dirty_metadata(handle, bitmap_bh);
+
+	if (metadata) {
+		/* blocks being freed are metadata. these blocks shouldn't
+		 * be used until this transaction is committed */
+		/* NOTE(review): a failure (-ENOMEM) is still ignored here */
+		ext3_mb_free_metadata(handle, &e3b, block_group, bit, count);
+	} else {
+		ext3_lock_group(sb, block_group);
+		mb_free_blocks(&e3b, bit, count);
+		ext3_unlock_group(sb, block_group);
+	}
+
+	spin_lock(sb_bgl_lock(sbi, block_group));
+	gdp->bg_free_blocks_count =
+		cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
+	spin_unlock(sb_bgl_lock(sbi, block_group));
+	percpu_counter_mod(&sbi->s_freeblocks_counter, count);
+	ext3_mb_dirty_buddy(&e3b);
+	ext3_mb_release_desc(&e3b);
+
+	/* accumulate: we pass here once per group the range touches */
+	*freed += count;
+
+	/* And the group descriptor block */
+	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
+	ret = ext3_journal_dirty_metadata(handle, gd_bh);
+	if (!err) err = ret;
+
+	if (overflow && !err) {
+		block += count;
+		count = overflow;
+		goto do_more;
+	}
+	sb->s_dirt = 1;
+error_return:
+	brelse(bitmap_bh);
+	ext3_std_error(sb, err);
+}
+
+/* Reserve @blocks free blocks: 0 on success, -ENOSPC if too few are left. */
+int ext3_mb_reserve_blocks(struct super_block *sb, int blocks)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	int avail, enough;
+
+	BUG_ON(blocks < 0);
+	spin_lock(&sbi->s_reserve_lock);
+	avail = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	enough = (blocks <= avail - sbi->s_blocks_reserved);
+	if (enough)
+		sbi->s_blocks_reserved += blocks;
+	spin_unlock(&sbi->s_reserve_lock);
+	return enough ? 0 : -ENOSPC;
+}
+
+/* Return @blocks of an earlier reservation; the total never goes below 0. */
+void ext3_mb_release_blocks(struct super_block *sb, int blocks)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+
+	BUG_ON(blocks < 0);
+	spin_lock(&sbi->s_reserve_lock);
+	sbi->s_blocks_reserved -= blocks;
+	WARN_ON(sbi->s_blocks_reserved < 0);	/* over-release: clamp below */
+	sbi->s_blocks_reserved = max(sbi->s_blocks_reserved, 0L);
+	spin_unlock(&sbi->s_reserve_lock);
+}
+
+/*
+ * Allocate one block.  With mballoc enabled this is a request of length 1
+ * to ext3_mb_new_blocks(); otherwise the old allocator is used.  @goal and
+ * @errp are passed through unchanged.
+ */
+int ext3_new_block(handle_t *handle, struct inode *inode,
+		unsigned long goal, int *errp)
+{
+	int len = 1;
+
+	if (test_opt(inode->i_sb, MBALLOC))
+		return ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp);
+	return ext3_new_block_old(handle, inode, goal, errp);
+}
+
+
+/* Free given blocks through the active allocator and update quota */
+void ext3_free_blocks(handle_t *handle, struct inode * inode,
+			unsigned long block, unsigned long count, int metadata)
+{
+	struct super_block *sb = inode->i_sb;
+	int nfreed;
+
+	if (test_opt(sb, MBALLOC))
+		ext3_mb_free_blocks(handle, inode, block, count,
+					metadata, &nfreed);
+	else
+		ext3_free_blocks_sb(handle, sb, block, count, &nfreed);
+	if (nfreed)
+		DQUOT_FREE_BLOCK(inode, nfreed);
+}
+
Index: linux-2.6.11/fs/ext3/super.c
===================================================================
--- linux-2.6.11.orig/fs/ext3/super.c	2005-03-04 18:26:39.000000000 +0300
+++ linux-2.6.11/fs/ext3/super.c	2005-03-04 22:17:20.000000000 +0300
@@ -377,6 +377,7 @@
 	struct ext3_super_block *es = sbi->s_es;
 	int i;
 
+	ext3_mb_release(sb);
  	ext3_ext_release(sb);
 	ext3_xattr_put_super(sb);
 	journal_destroy(sbi->s_journal);
@@ -582,7 +583,7 @@
 	Opt_commit, Opt_journal_update, Opt_journal_inum,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
-	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0,
+	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_mballoc, Opt_mbfactor,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_extents, Opt_extdebug,
 };
 
@@ -633,6 +634,8 @@
 	{Opt_barrier, "barrier=%u"},
 	{Opt_extents, "extents"},
 	{Opt_extdebug, "extdebug"},
+	{Opt_mballoc, "mballoc"},
+	{Opt_mbfactor, "mbfactor=%u"},
 	{Opt_err, NULL},
 	{Opt_resize, "resize"},
 };
@@ -934,6 +937,16 @@
 		case Opt_extdebug:
 			set_opt (sbi->s_mount_opt, EXTDEBUG);
 			break;
+		case Opt_mballoc:
+			set_opt (sbi->s_mount_opt, MBALLOC);
+			break;
+		case Opt_mbfactor:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0)
+				return 0;
+			sbi->s_mb_factor = option;
+			break;
 		default:
 			printk (KERN_ERR
 				"EXT3-fs: Unrecognized mount option \"%s\" "
@@ -1620,6 +1633,7 @@
 		ext3_count_dirs(sb));
 
 	ext3_ext_init(sb);
+	ext3_mb_init(sb, needs_recovery);
 
 	lock_kernel();
 	return 0;
Index: linux-2.6.11/fs/ext3/Makefile
===================================================================
--- linux-2.6.11.orig/fs/ext3/Makefile	2005-03-04 18:26:39.000000000 +0300
+++ linux-2.6.11/fs/ext3/Makefile	2005-03-04 22:17:20.000000000 +0300
@@ -5,7 +5,8 @@
 obj-$(CONFIG_EXT3_FS) += ext3.o
 
 ext3-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
-	   ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o
+	   ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
+	   mballoc.o
 
 ext3-$(CONFIG_EXT3_FS_XATTR)	 += xattr.o xattr_user.o xattr_trusted.o
 ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
Index: linux-2.6.11/fs/ext3/balloc.c
===================================================================
--- linux-2.6.11.orig/fs/ext3/balloc.c	2005-03-02 20:49:09.000000000 +0300
+++ linux-2.6.11/fs/ext3/balloc.c	2005-03-04 22:17:21.000000000 +0300
@@ -79,7 +79,7 @@
  *
  * Return buffer_head on success or NULL in case of failure.
  */
-static struct buffer_head *
+struct buffer_head *
 read_block_bitmap(struct super_block *sb, unsigned int block_group)
 {
 	struct ext3_group_desc * desc;
@@ -454,24 +454,6 @@
 	return;
 }
 
-/* Free given blocks, update quota and i_blocks field */
-void ext3_free_blocks(handle_t *handle, struct inode *inode,
-			unsigned long block, unsigned long count)
-{
-	struct super_block * sb;
-	int dquot_freed_blocks;
-
-	sb = inode->i_sb;
-	if (!sb) {
-		printk ("ext3_free_blocks: nonexistent device");
-		return;
-	}
-	ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
-	if (dquot_freed_blocks)
-		DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
-	return;
-}
-
 /*
  * For ext3 allocations, we must not reuse any blocks which are
  * allocated in the bitmap buffer's "last committed data" copy.  This
@@ -1134,7 +1116,7 @@
  * bitmap, and then for any free bit if that fails.
  * This function also updates quota and i_blocks field.
  */
-int ext3_new_block(handle_t *handle, struct inode *inode,
+int ext3_new_block_old(handle_t *handle, struct inode *inode,
 			unsigned long goal, int *errp)
 {
 	struct buffer_head *bitmap_bh = NULL;
Index: linux-2.6.11/fs/ext3/namei.c
===================================================================
--- linux-2.6.11.orig/fs/ext3/namei.c	2005-03-02 20:49:09.000000000 +0300
+++ linux-2.6.11/fs/ext3/namei.c	2005-03-04 22:17:21.000000000 +0300
@@ -1629,7 +1629,7 @@
  * If the create succeeds, we fill in the inode information
  * with d_instantiate(). 
  */
-static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
+int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
 		struct nameidata *nd)
 {
 	handle_t *handle; 
Index: linux-2.6.11/fs/ext3/inode.c
===================================================================
--- linux-2.6.11.orig/fs/ext3/inode.c	2005-03-04 18:26:39.000000000 +0300
+++ linux-2.6.11/fs/ext3/inode.c	2005-03-04 22:17:21.000000000 +0300
@@ -571,7 +571,7 @@
 		ext3_journal_forget(handle, branch[i].bh);
 	}
 	for (i = 0; i < keys; i++)
-		ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
+		ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1);
 	return err;
 }
 
@@ -672,7 +672,7 @@
 	if (err == -EAGAIN)
 		for (i = 0; i < num; i++)
 			ext3_free_blocks(handle, inode, 
-					 le32_to_cpu(where[i].key), 1);
+					 le32_to_cpu(where[i].key), 1, 1);
 	return err;
 }
 
@@ -1832,7 +1832,7 @@
 		}
 	}
 
-	ext3_free_blocks(handle, inode, block_to_free, count);
+	ext3_free_blocks(handle, inode, block_to_free, count, 1);
 }
 
 /**
@@ -2005,7 +2005,7 @@
 				ext3_journal_test_restart(handle, inode);
 			}
 
-			ext3_free_blocks(handle, inode, nr, 1);
+			ext3_free_blocks(handle, inode, nr, 1, 1);
 
 			if (parent_bh) {
 				/*
Index: linux-2.6.11/fs/ext3/extents.c
===================================================================
--- linux-2.6.11.orig/fs/ext3/extents.c	2005-03-04 18:26:39.000000000 +0300
+++ linux-2.6.11/fs/ext3/extents.c	2005-03-04 22:18:33.000000000 +0300
@@ -774,7 +774,7 @@
 		for (i = 0; i < depth; i++) {
 			if (!ablocks[i])
 				continue;
-			ext3_free_blocks(handle, tree->inode, ablocks[i], 1);
+			ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1);
 		}
 	}
 	kfree(ablocks);
@@ -1431,7 +1431,7 @@
 			path->p_idx->ei_leaf);
 	bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf);
 	ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf);
-	ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1);
+	ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1);
 	return err;
 }
 
@@ -1940,7 +1940,7 @@
 	int needed = ext3_remove_blocks_credits(tree, ex, from, to);
 	handle_t *handle = ext3_journal_start(tree->inode, needed);
 	struct buffer_head *bh;
-	int i;
+	int i, metadata = 0;
 
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
@@ -1959,6 +1959,8 @@
 		spin_unlock(&sbi->s_ext_stats_lock);
 	}
 #endif
+	if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode))
+		metadata = 1;
 	if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) {
 		/* tail removal */
 		unsigned long num, start;
@@ -1970,7 +1972,7 @@
 			bh = sb_find_get_block(tree->inode->i_sb, start + i);
 			ext3_forget(handle, 0, tree->inode, bh, start + i);
 		}
-		ext3_free_blocks(handle, tree->inode, start, num);
+		ext3_free_blocks(handle, tree->inode, start, num, metadata);
 	} else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) {
 		printk("strange request: removal %lu-%lu from %u:%u\n",
 			from, to, ex->ee_block, ex->ee_len);
Index: linux-2.6.11/fs/ext3/xattr.c
===================================================================
--- linux-2.6.11.orig/fs/ext3/xattr.c	2005-03-02 20:49:09.000000000 +0300
+++ linux-2.6.11/fs/ext3/xattr.c	2005-03-04 22:17:21.000000000 +0300
@@ -484,7 +484,7 @@
 		ea_bdebug(bh, "refcount now=0; freeing");
 		if (ce)
 			mb_cache_entry_free(ce);
-		ext3_free_blocks(handle, inode, bh->b_blocknr, 1);
+		ext3_free_blocks(handle, inode, bh->b_blocknr, 1, 1);
 		get_bh(bh);
 		ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
 	} else {
@@ -804,7 +804,7 @@
 			new_bh = sb_getblk(sb, block);
 			if (!new_bh) {
 getblk_failed:
-				ext3_free_blocks(handle, inode, block, 1);
+				ext3_free_blocks(handle, inode, block, 1, 1);
 				error = -EIO;
 				goto cleanup;
 			}
Index: linux-2.6.11/include/linux/ext3_fs.h
===================================================================
--- linux-2.6.11.orig/include/linux/ext3_fs.h	2005-03-04 18:26:39.000000000 +0300
+++ linux-2.6.11/include/linux/ext3_fs.h	2005-03-04 22:17:21.000000000 +0300
@@ -57,6 +57,14 @@
 #define ext3_debug(f, a...)	do {} while (0)
 #endif
 
+#define EXT3_MULTIBLOCK_ALLOCATOR	1
+
+#define EXT3_MB_HINT_MERGE		1
+#define EXT3_MB_HINT_RESERVED		2
+#define EXT3_MB_HINT_METADATA		4
+#define EXT3_MB_HINT_FIRST		8
+#define EXT3_MB_HINT_BEST		16
+
 /*
  * Special inodes numbers
  */
@@ -363,6 +371,7 @@
 #define EXT3_MOUNT_BARRIER		0x20000 /* Use block barriers */
 #define EXT3_MOUNT_EXTENTS		0x40000	/* Extents support */
 #define EXT3_MOUNT_EXTDEBUG		0x80000	/* Extents debug */
+#define EXT3_MOUNT_MBALLOC		0x100000/* Buddy allocation support */
 
 /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
@@ -723,7 +732,7 @@
 extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
 extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *);
 extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long,
-			      unsigned long);
+			      unsigned long, int);
 extern void ext3_free_blocks_sb (handle_t *, struct super_block *,
 				 unsigned long, unsigned long, int *);
 extern unsigned long ext3_count_free_blocks (struct super_block *);
@@ -843,6 +852,37 @@
 extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *);
 extern int ext3_ext_ioctl(struct inode *, struct file *, unsigned int, unsigned long);
 
+/* mballoc.c */
+extern int ext3_mb_init(struct super_block *, int);
+extern int ext3_mb_release(struct super_block *);
+extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *);
+extern int ext3_mb_reserve_blocks(struct super_block *, int);
+extern void ext3_mb_release_blocks(struct super_block *, int);
+
+/* writeback.c */
+extern int ext3_wb_writepages(struct address_space *, struct writeback_control *);
+extern int ext3_wb_prepare_write(struct file *file, struct page *page,
+			      unsigned from, unsigned to);
+extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned);
+extern int ext3_wb_writepage(struct page *, struct writeback_control *);
+extern int ext3_wb_invalidatepage(struct page *, unsigned long);
+extern int ext3_wb_releasepage(struct page *, int);
+extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t);
+extern void ext3_wb_init(struct super_block *);
+extern void ext3_wb_release(struct super_block *);
+
+/* writeback.c */
+extern int ext3_wb_writepages(struct address_space *, struct writeback_control *);
+extern int ext3_wb_prepare_write(struct file *file, struct page *page,
+			      unsigned from, unsigned to);
+extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned);
+extern int ext3_wb_writepage(struct page *, struct writeback_control *);
+extern int ext3_wb_invalidatepage(struct page *, unsigned long);
+extern int ext3_wb_releasepage(struct page *, int);
+extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t);
+extern void ext3_wb_init(struct super_block *);
+extern void ext3_wb_release(struct super_block *);
+
 #endif	/* __KERNEL__ */
 
 #endif	/* _LINUX_EXT3_FS_H */
Index: linux-2.6.11/include/linux/ext3_fs_sb.h
===================================================================
--- linux-2.6.11.orig/include/linux/ext3_fs_sb.h	2005-03-04 18:26:39.000000000 +0300
+++ linux-2.6.11/include/linux/ext3_fs_sb.h	2005-03-04 22:17:21.000000000 +0300
@@ -21,9 +21,29 @@
 #include <linux/wait.h>
 #include <linux/blockgroup_lock.h>
 #include <linux/percpu_counter.h>
+#include <linux/list.h>
 #endif
 #include <linux/rbtree.h>
 
+#define EXT3_BB_MAX_BLOCKS	30	/* capacity of one ext3_free_metadata */
+struct ext3_free_metadata {
+	unsigned short group;	/* block group the queued blocks are in */
+	unsigned short num;	/* number of used entries in blocks[] */
+	unsigned short blocks[EXT3_BB_MAX_BLOCKS]; /* bit numbers in group */
+	struct list_head list;	/* link on an sbi s_*_transaction list */
+};
+
+struct ext3_buddy_group_blocks {
+	__u32		bb_bitmap;	/* b_blocknr of the bitmap block */
+	__u32		bb_buddy;	/* b_blocknr of the buddy block */
+	spinlock_t	bb_lock;
+	unsigned long 	bb_tid;		/* tid recorded with bb_md_cur */
+	struct ext3_free_metadata *bb_md_cur;	/* container being filled */
+	unsigned short	bb_first_free;
+	unsigned short	bb_free;	/* free blocks in the group */
+	unsigned 	bb_counters[];	/* s_blocksize_bits + 2 entries */
+};
+
 /*
  * third extended-fs super-block data in memory
  */
@@ -86,6 +106,27 @@
 	spinlock_t s_ext_stats_lock;
 	unsigned long s_ext_blocks;
 	unsigned long s_ext_extents;
+
+	/* for buddy allocator */
+	struct ext3_buddy_group_blocks **s_buddy_blocks;
+	struct inode *s_buddy;
+	long s_blocks_reserved;
+	spinlock_t s_reserve_lock;
+	struct list_head s_active_transaction;
+	struct list_head s_closed_transaction;
+	struct list_head s_committed_transaction;
+	spinlock_t s_md_lock;
+	tid_t s_last_transaction;
+	int s_mb_factor;
+
+	/* stats for buddy allocator */
+	spinlock_t s_bal_lock;
+	unsigned long s_bal_reqs;	/* number of reqs with len > 1 */
+	unsigned long s_bal_success;	/* we found long enough chunks */
+	unsigned long s_bal_allocated;	/* in blocks */
+	unsigned long s_bal_ex_scanned;	/* total extents scanned */
+	unsigned long s_bal_goals;	/* goal hits */
+	unsigned long s_bal_breaks;	/* too long searches */
 };
 
 #endif	/* _LINUX_EXT3_FS_SB */

