/*
 * nasd_bmap.c
 *
 * NASD object map management
 *
 * Author: Jim Zelenka
 */
/*
 * Copyright (c) of Carnegie Mellon University, 1997,1998,1999.
 *
 * Permission to reproduce, use, and prepare derivative works of
 * this software for internal use is granted provided the copyright
 * and "No Warranty" statements are included with all reproductions
 * and derivative works. This software may also be redistributed
 * without charge provided that the copyright and "No Warranty"
 * statements are included in all redistributions.
 *
 * NO WARRANTY. THIS SOFTWARE IS FURNISHED ON AN "AS IS" BASIS.
 * CARNEGIE MELLON UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER
 * EXPRESSED OR IMPLIED AS TO THE MATTER INCLUDING, BUT NOT LIMITED
 * TO: WARRANTY OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY
 * OF RESULTS OR RESULTS OBTAINED FROM USE OF THIS SOFTWARE. CARNEGIE
 * MELLON UNIVERSITY DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT
 * TO FREEDOM FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 */


#include <nasd/nasd_options.h>
#include <nasd/nasd_drive_options.h>
#define NASD_OD_INCLUDE_COUNTS 1
#include <nasd/nasd_types.h>
#include <nasd/nasd_freelist.h>
#include <nasd/nasd_itypes.h>
#include <nasd/nasd_mem.h>
#include <nasd/nasd_cache.h>
#include <nasd/nasd_common.h>
#include <nasd/nasd_layout.h>

/*
 * nasd_od_bmap()-internal macros
 */

/*
 * GETMAXFC: assign max forward clustering to _m_, using _im_ as
 * the parameterized size. 0 for _im_ = no limit (_m_ becomes
 * NASD_ODC_MAX_CLUSTER). Uses _b_ as current endpoint.
 *
 * Reads the variable "flags" from the scope where the macro is
 * expanded. With NASD_ODC_B_ALIGN set, _im_ is used as a modulus
 * and _m_ becomes ((_im_) - 1) - ((_b_) % (_im_)); otherwise _m_
 * is the smaller of _im_ and NASD_ODC_MAX_CLUSTER.
 *
 * Wrapped in do/while(0) so the expansion is exactly one statement
 * (safe in an unbraced if/else). Temporaries carry a gmfc_ prefix
 * so they cannot capture an argument that happens to be named
 * "mod" or "inclust". _im_ is evaluated more than once.
 */
#define GETMAXFC(_m_,_im_,_b_) do { \
  nasd_blkno_t gmfc_mod, gmfc_inclust; \
\
  if (_im_) { \
    if (flags&NASD_ODC_B_ALIGN) { \
      gmfc_mod = (_im_); \
      gmfc_inclust = (_b_) % gmfc_mod; \
      (_m_) = ((_im_) - 1) - gmfc_inclust; \
    } \
    else { \
      (_m_) = NASD_MIN((_im_),NASD_ODC_MAX_CLUSTER); \
    } \
  } \
  else { \
    (_m_) = NASD_ODC_MAX_CLUSTER; \
  } \
} while (0)

/*
 * GETMAXRC: assign max reverse clustering to _m_, using _im_ as
 * the parameterized size. 0 for _im_ = no limit (_m_ becomes
 * NASD_ODC_MAX_CLUSTER). Uses _b_ as current endpoint.
 *
 * Reads the variable "flags" from the scope where the macro is
 * expanded. With NASD_ODC_B_ALIGN set, _im_ is used as a modulus
 * and _m_ becomes (_b_) % (_im_); otherwise _m_ is the smaller of
 * _im_ and NASD_ODC_MAX_CLUSTER.
 *
 * Wrapped in do/while(0) so the expansion is exactly one statement
 * (safe in an unbraced if/else). The temporary carries a gmrc_
 * prefix so it cannot capture an argument named "mod". _im_ is
 * evaluated more than once.
 */
#define GETMAXRC(_m_,_im_,_b_) do { \
  nasd_blkno_t gmrc_mod; \
\
  if (_im_) { \
    if (flags&NASD_ODC_B_ALIGN) { \
      gmrc_mod = (_im_); \
      (_m_) = (_b_) % gmrc_mod; \
    } \
    else { \
      (_m_) = NASD_MIN((_im_),NASD_ODC_MAX_CLUSTER); \
    } \
  } \
  else { \
    (_m_) = NASD_ODC_MAX_CLUSTER; \
  } \
} while (0)

/*
 * BLOCKVAL: block number stored in slot _i_ of pointer array _a_,
 * or 0 when _a_ is a null pointer (lets callers walk a missing
 * subtree as if it were all zeros).
 * BLKPTRS: number of pointers held by one block at indirection
 * level _lvl_ (level 0 = direct pointers).
 * Arguments are fully parenthesized so expressions may be passed.
 */
#define BLOCKVAL(_a_,_i_) ((_a_) ? (((_a_)[(_i_)]).blkno) : 0)
#define BLKPTRS(_lvl_) ((_lvl_) ? NASD_OD_IPTRS_PER_BLOCK : NASD_OD_DPTRS_PER_BLOCK)

/*
 * Hand out the next not-yet-consumed block number from the extent
 * bookkeeping in exl.
 *
 * Returns 0 (leaving *exln untouched) when the unused chain is
 * empty. Otherwise the block at offset exl->usedc inside the first
 * unused extent is returned, both usage counters and *exln are
 * bumped, and -- once the extent's final block has been handed out --
 * the extent is moved to the head of the used chain and the
 * per-extent counter restarts at 0.
 */
NASD_INLINE nasd_blkno_t
nasd_od_bm_nextf(
  nasd_odc_exle_t  *exl,
  int              *exln)
{
  nasd_odc_exlist_ent_t *cur;
  nasd_blkno_t blk;

  cur = exl->unused;
  if (cur == NULL) {
    return(0);
  }

  blk = cur->range.first + exl->usedc;
  exl->usedc += 1;
  exl->total_usedc += 1;

  if (blk == cur->range.last) {
    /* extent exhausted: retire it onto the used chain */
    exl->unused = cur->next;
    cur->next = exl->used;
    exl->used = cur;
    exl->usedc = 0;
  }

  (*exln)++;

  NASD_ASSERT(blk <= nasd_od_blocks);
  return(blk);
}


/*
 * nasd_od_fbmap
 *
 * Internal mapping function. !!! RECURSIVE !!!
 * Takes an array of pointers, and an indication of
 * the indirection level. Forces block faulting
 * where necessary to fill range: every zero pointer met
 * on the way is replaced with a block taken from exl.
 *
 * pe should be locked
 *
 * Returns NASD_BAD_BLKCNT for an out-of-range in_lblkcnt,
 * NASD_NO_SPACE when exl runs dry, any error reported by
 * nasd_odc_block_get(), or NASD_SUCCESS. *bp is written back
 * on every exit taken after the argument checks.
 *
 * *last_fb_p receives the physical block number behind the most
 * recently visited leaf slot, so after a successful pass it names
 * the block backing the last logical block handled.
 */
nasd_status_t
nasd_od_fbmap(
  nasd_odc_ent_t   *node_ent,       /* ent containing node */
  nasd_odc_ent_t   *pe,             /* parent ent (contains blk array) */
  void             *blocks,         /* current block array */
  int               nblocks,        /* size of this array */
  nasd_oblkno_t     in_lblkno,      /* first logical block to map,
                                     * where 0 == beginning leaf of
                                     * this array */
  nasd_oblkno_t     in_lblkcnt,     /* blocks to map */
  int               level,          /* current indirection level */
  int               partnum,        /* partition number */
  int               flags,          /* operational flags */
  nasd_odc_exle_t  *exl,            /* newly allocated blocks */
  int              *exln,           /* position in exl set (in+out) */
  int              *bp,             /* position in blkp array (in+out) */
  nasd_blkrec_t    *blkp,           /* out array of pointers */
  nasd_offset_t    *offsetp,        /* offset within NASD object (in+out) */
  nasd_blkno_t     *last_fb_p)      /* last block mapped */
{
  nasd_od_indirect_ptr_t *indirect;
  nasd_od_direct_ptr_t *direct;
  nasd_odc_ent_t *nze, *ent;
  nasd_oblkno_t n, ni, pn;
  nasd_odc_icpart_t *icp;
  nasd_od_part_t *part;
  nasd_blkno_t nzb;
  nasd_status_t rc;
  int b, d;

#if NASD_OD_EXT_PTR > 0
  int grabbed_reference = NASD_FALSE;
#endif /*  NASD_OD_EXT_PTR > 0 */

  if (in_lblkcnt <= 0)
    return(NASD_BAD_BLKCNT);
  if (in_lblkcnt > NASD_ODC_MAX_BMAP)
    return(NASD_BAD_BLKCNT);

  part = &PART(partnum);
  icp = &nasd_odc_state->parts[partnum];

  b = *bp;
  n = in_lblkno;
  d = 0; /* becomes nonzero once we modify the pointer array in pe */

  if (level == 0) {
    direct = (nasd_od_direct_ptr_t *)blocks;
    /*
     * Direct
     */
    while((b<in_lblkcnt)&&(n<nblocks)) {
      NASD_ASSERT(blkp[b].blkno == direct[n].blkno);
      if (direct[n].blkno == 0) {
        /*
         * Drop in our next zero block.
         */
        nzb = nasd_od_bm_nextf(exl, exln);
        NASD_ASSERT(nzb <= nasd_od_blocks);
        if (nzb == 0) {
          /*
           * Out of space.
           */
          *bp = b;
          return(NASD_NO_SPACE);
        }
        blkp[b].blkno = direct[n].blkno = nzb;
#if NASD_OD_EXT_PTR > 0
        /* Grab (hopefully) one reference per direct block we handle 
         -- assumption is that we give up everything in the
         blkp array at once  so one reference is fine.
         
         Also store a pointer to this blkno direct pointer block in the 
         referenced cache entry so we can r/w the digest
        */
        if (blkp[b].blkno != 0) { 
          blkp[b].digest=direct[n].digest;
          blkp[b].odc_entp=pe;
          if (grabbed_reference != NASD_TRUE) {
            nasd_odc_block_ref(pe);
            blkp[b].flags |= NASD_HAS_BLOCK_REF;    
            grabbed_reference=NASD_TRUE;
          } else {
            blkp[b].flags &= ~(NASD_HAS_BLOCK_REF);    
          }
        } else {
          blkp[b].flags &= ~(NASD_HAS_BLOCK_REF);    
        }

#endif /*  NASD_OD_EXT_PTR > 0 */
        
        rc = nasd_odc_block_get(node_ent, (nasd_blkno_t)nzb,
          NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK, &nze,
          node_ent->identifier, *offsetp,
          NASD_ODC_T_DATA, NULL);
        if (rc) {
          *bp = b;
          return(rc);
        }
        NASD_ODC_LOCK_BLOCK(nze);
        nasd_odc_wait_not_busy(nze);
        /* mark it uninitialized */
        nze->data_flags |= NASD_CD_NZ;
        NASD_ODC_UNLOCK_BLOCK(nze);
        nasd_odc_block_release(nze);
        d++;
      }
      /*
       * Report the physical block behind this leaf. Previously the
       * out-parameter was never written, so the caller always saw
       * the 0 it had stored there beforehand.
       */
      if (last_fb_p) {
        *last_fb_p = direct[n].blkno;
      }
      *offsetp += NASD_OD_BASIC_BLOCKSIZE;
      n++;
      b++;
    }
    if (d) {
      /* we dirtied this block */
      nasd_odc_dirty_ent(pe);
    }
    *bp = b;
    return(NASD_SUCCESS);
  }

  /*
   * We're still indirect. Drop in and initialize blocks as
   * necessary.
   */
  indirect = (nasd_od_indirect_ptr_t *)blocks;
  ni = n / nasd_od_ilvl_leaves[level];
  NASD_ASSERT(ni >= 0);
  NASD_ASSERT(ni < nblocks);
  while((b<in_lblkcnt)&&(ni<nblocks)) {
    /*
     * pn: leaf offset inside subtree ni at which the child starts.
     *
     * NOTE(review): n advances by a whole subtree per iteration, so
     * pn keeps the initial offset for every later ni instead of
     * restarting at 0. That looks wrong for ranges spanning more
     * than one subtree, but nasd_od_ibmap() walks the tree the same
     * way, so the two must be changed together -- TODO confirm.
     */
    pn = n - (ni * nasd_od_ilvl_leaves[level]);
    ent = NULL;
    if (indirect[ni].blkno == 0) {
      /*
       * Drop in block of zeros. Will be reinitted in recursive
       * calls.
       */
      nzb = nasd_od_bm_nextf(exl, exln);
      NASD_ASSERT(nzb <= nasd_od_blocks);
      if (nzb == 0) {
        *bp = b;
        return(NASD_NO_SPACE);
      }
      d++;
      indirect[ni].blkno = nzb;
      rc = nasd_odc_block_get(node_ent, (nasd_blkno_t)nzb,
        NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK, &nze,
        node_ent->identifier, 0,
        NASD_ODC_T_IND, NULL);
      if (rc) {
        *bp = b;
        return(rc);
      }
      NASD_ODC_LOCK_BLOCK(nze);
      nasd_odc_wait_not_busy(nze);
      /* Initialize it */
      bzero((char *)nze->data.buf, NASD_OD_BASIC_BLOCKSIZE);
      nze->data_flags &= ~(NASD_CD_INVALID|NASD_CD_NZ);
      ent = nze;
    }
    if (ent == NULL) {
      /* pointer block already existed: load it */
      rc = nasd_odc_block_get(node_ent, indirect[ni].blkno,
        NASD_ODC_L_LOAD|NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK,
        &ent, node_ent->identifier, 0, NASD_ODC_T_IND, NULL);
      if (rc) {
        *bp = b;
        return(rc);
      }
      NASD_ODC_LOCK_BLOCK(ent);
      nasd_odc_wait_not_busy_invalid(ent);
    }
    /* ent is locked here in both cases; recurse one level down */
    rc = nasd_od_fbmap(node_ent, ent, ent->data.blk, BLKPTRS(level-1),
      pn, in_lblkcnt, level-1, partnum, flags, exl, exln, &b, blkp,
      offsetp, last_fb_p);
    NASD_ODC_UNLOCK_BLOCK(ent);
    nasd_odc_block_release(ent);
    if (rc) {
      *bp = b;
      return(rc);
    }
    n += nasd_od_ilvl_leaves[level];
    ni++;
    NASD_ASSERT(ni >= 0);
    NASD_ASSERT(ni <= nblocks);
  }
  if (d) {
    nasd_odc_dirty_ent(pe);
  }
  *bp = b;
  return(NASD_SUCCESS);
}


/*
 * nasd_od_ibmap
 *
 * Internal mapping function. !!! RECURSIVE !!!
 * Takes an array of pointers, and an indication of
 * the indirection level. Resolves a partial range
 * of the file. 
 *
 * pe should be locked
 *
 * blocks may be NULL, in which case every slot reads as 0 (used
 * to walk below a missing indirect block). Each zero pointer met
 * bumps *zp and zpa[0]. While no zero has been seen yet, *firstbp
 * tracks the most recent nonzero block number encountered.
 *
 * Returns NASD_BAD_BLKCNT for an out-of-range in_lblkcnt, any
 * error from nasd_odc_block_get(), or NASD_SUCCESS. *bp is written
 * back on every exit taken after the argument checks.
 */
nasd_status_t
nasd_od_ibmap(
  nasd_odc_ent_t  *node_ent,       /* ent containing node */
  nasd_odc_ent_t  *pe,             /* parent entry (contains blocks array) */
  void            *blocks,         /* current block array */
  int              nblocks,        /* size of this array */
  nasd_oblkno_t    in_lblkno,      /* first logical block to map,
                                    * where 0 == beginning leaf of
                                    * this array */
  nasd_oblkno_t    in_lblkcnt,     /* blocks to map */
  int              level,          /* current indirection level */
  nasd_blkcnt_t    in_beforemax,   /* max blocks to cluster backwards */
  nasd_blkcnt_t    in_aftermax,    /* max blocks to cluster forwards */
  int              partnum,        /* partition number */
  int              flags,          /* operational flags */
  nasd_blkno_t    *firstbp,        /* find last non-zero block */
  int             *bp,             /* position in blkp array (in+out) */
  nasd_blkrec_t   *blkp,           /* out array of pointers */
  nasd_blkcnt_t   *blocks_beforep, /* contig blocks before *blkp */
  nasd_blkcnt_t   *blocks_afterp,  /* contig blocks after *blkp */
  int             *zp,             /* zero-block counter pointer */
  int             *zpa)            /* zero-block array counter pointer */
{
  nasd_od_indirect_ptr_t *indirect;
  nasd_od_direct_ptr_t *direct;
  nasd_blkno_t maxclust, ind;
  nasd_oblkno_t n, ni, pn;
  nasd_odc_icpart_t *icp;
  int b, i, first, last;
  nasd_od_part_t *part;
  nasd_odc_ent_t *ent;
  nasd_status_t rc;
#if NASD_OD_EXT_PTR > 0
  int grabbed_reference = NASD_FALSE;
#endif /*  NASD_OD_EXT_PTR > 0 */

  if (in_lblkcnt <= 0)
    return(NASD_BAD_BLKCNT);
  if (in_lblkcnt > NASD_ODC_MAX_BMAP)
    return(NASD_BAD_BLKCNT);

  NASD_ASSERT(level >= 0);
  NASD_ASSERT(level < NASD_OD_ILVLS);

  part = &PART(partnum);
  icp = &nasd_odc_state->parts[partnum];

  b = *bp;
  n = in_lblkno;
  /* first: this call starts at the very beginning of blkp */
  first = (*bp) ? 0 : 1;
  /* last: set below once this call fills the final blkp slot */
  last = 0;

  if (level == 0) {
    direct = (nasd_od_direct_ptr_t *)blocks;
    /*
     * We're direct. Party on.
     */
    while((b<in_lblkcnt)&&(n<nblocks)) {
      blkp[b].blkno = BLOCKVAL(direct,n);
#if NASD_OD_EXT_PTR > 0
        /* Grab (hopefully) one reference per direct block we handle 
         -- assumption is that we give up everything in the
         blkp array at once  so one reference is fine.
         
         Also store a pointer to this blkno direct pointer block in the 
         referenced cache entry so we can r/w the digest
        */
     if (blkp[b].blkno != 0) { 
       blkp[b].digest=direct[n].digest;
       blkp[b].odc_entp=pe; 
       if (grabbed_reference != NASD_TRUE) {
         nasd_odc_block_ref(pe);
         blkp[b].flags |= NASD_HAS_BLOCK_REF;    
         grabbed_reference=NASD_TRUE;
       } else {
         blkp[b].flags &= ~(NASD_HAS_BLOCK_REF);    
       }
     } else {
       blkp[b].flags &= ~(NASD_HAS_BLOCK_REF);    
     }
#endif /*  NASD_OD_EXT_PTR > 0 */
        
      if (blkp[b].blkno == 0) {
        *zp += 1; 
        zpa[0]++;
      }
      else if ((*zp) == 0) {
        *firstbp = blkp[b].blkno;
      }
      n++;
      b++;
    }
    if (b >= in_lblkcnt)
      last = 1;
    /*
     * Reverse-cluster
     */
    if (direct && blocks_beforep && first && blkp[0].blkno) {
      GETMAXRC(maxclust,in_beforemax,blkp[0].blkno);
      for(i=in_lblkno-1;((i>=0)&&((*blocks_beforep)<maxclust));i--) {
        if (direct[i].blkno && (direct[i].blkno+1 == direct[i+1].blkno)) {
          *blocks_beforep += 1;
        }
        else {
          break;
        }
      }
    }
    /*
     * Forward-cluster
     *
     * The endpoint is the physical block of the last slot we filled,
     * blkp[b-1].blkno (mirrors blkp[0].blkno above). This used to
     * index bp[], which is the int position counter, not the block
     * record array. last implies b >= in_lblkcnt >= 1, so b-1 is a
     * valid index.
     */
    if (direct && blocks_afterp && last && blkp[b-1].blkno) {
      GETMAXFC(maxclust,in_aftermax,blkp[b-1].blkno);
      for(i=n;((i<nblocks)&&((*blocks_afterp)<maxclust));i++) {
        if (direct[i].blkno && (direct[i].blkno-1 == direct[i-1].blkno)) {
          *blocks_afterp += 1;
        }
        else {
          break;
        }
      }
    }
    *bp = b;
    return(NASD_SUCCESS);
  }

  /*
   * We're at some level of indirection. Crank through our current
   * block, pulling in subhierarchies of blocks.
   */
  indirect = (nasd_od_indirect_ptr_t *)blocks;
  ni = n / nasd_od_ilvl_leaves[level];
  NASD_ASSERT(ni >= 0);
  NASD_ASSERT(ni < nblocks);
  while((b<in_lblkcnt)&&(ni<nblocks)) {
    ind = BLOCKVAL(indirect,ni);
    /*
     * pn: leaf offset inside subtree ni at which the child starts.
     *
     * NOTE(review): n advances by a whole subtree per iteration, so
     * pn keeps the initial offset for every later ni instead of
     * restarting at 0. That looks wrong for ranges spanning more
     * than one subtree, but nasd_od_fbmap() walks the tree the same
     * way, so the two must be changed together -- TODO confirm.
     */
    pn = n - (ni * nasd_od_ilvl_leaves[level]);
    if (ind == 0) {
      *zp += 1;
      zpa[0]++;
      /* a hole breaks physical contiguity: stop reporting clustering */
      blocks_beforep = blocks_afterp = NULL;
      /*
       * XXX
       * The right thing to do here is to figure out how many leaves
       * we have below us in the tree, and bzero that chunk of blocks
       * out of blkp, also updating the zero-counters. Should fix this.
       */
      rc = nasd_od_ibmap(node_ent, NULL, NULL, BLKPTRS(level-1), pn,
        in_lblkcnt, level - 1, 0, 0, partnum, flags, firstbp, &b, blkp,
        NULL, NULL, zp, zpa);
      if (rc) {
        *bp = b;
        return(rc);
      }
    }
    else {
      if ((*zp) == 0) {
        *firstbp = ind;
      }
      rc = nasd_odc_block_get(node_ent, ind,
        NASD_ODC_L_LOAD|NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK,
        &ent, node_ent->identifier, 0, NASD_ODC_T_IND, NULL);
      if (rc) {
        *bp = b;
        return(rc);
      }
      NASD_ODC_LOCK_BLOCK(ent);
      nasd_odc_wait_not_busy_invalid(ent);
      rc = nasd_od_ibmap(node_ent, ent, ent->data.blk, BLKPTRS(level-1),
        pn, in_lblkcnt, level-1, in_beforemax, in_aftermax, partnum, flags,
        firstbp, &b, blkp, blocks_beforep, blocks_afterp, zp, zpa);
      NASD_ODC_UNLOCK_BLOCK(ent);
      nasd_odc_block_release(ent);
      if (rc) {
        *bp = b;
        return(rc);
      }
    }
    n += nasd_od_ilvl_leaves[level];
    ni++;
    NASD_ASSERT(ni >= 0);
    NASD_ASSERT(ni <= nblocks);
  }

  *bp = b;
  return(NASD_SUCCESS);
}

/*
 * nasd_od_bmap
 *
 * Core layout routine. Given a logical block range within an object,
 * will compute the physical block numbers containing this block range
 * (0 if no such block- a la sparse allocation). Will also compute
 * the number of blocks physically contiguous to this range that
 * contain logically contiguous blocks within the object as well
 * (blocks_before and blocks_after). Note that there may be
 * additional, unreported contiguous blocks. This is because
 * the mapper will never block waiting for another block of
 * indirect pointers, just to generate this information.
 *
 * Caller should hold ref, lock on node ent.
 *
 * Flags:
 *  NASD_ODC_B_FAULT: ensure that this object has unique pointers
 *  to the target block. This may require copying an arbitrary
 *  number of blocks to an equal number of newly-allocated blocks.
 *  This is intended for COW-touching cases.
 *
 *  NASD_ODC_B_ALLOC: ensure the target block exists. This may
 *  require an arbitrary number of block allocations. This is
 *  intended for write-touching cases. If you specify this flag
 *  without specifying NASD_ODC_B_FAULT, think again. You probably
 *  just corrupted a few COW'd objects.
 *
 *  NASD_ODC_B_ALIGN: before and after maxs are alignment masks,
 *  not counts.
 *
 * Operation: pass 1 (nasd_od_ibmap) resolves the existing mapping
 * and counts the zero pointers met. If any were found and
 * NASD_ODC_B_ALLOC is set, blocks are obtained (preallocation
 * first, then the allocator) and pass 2 (nasd_od_fbmap) installs
 * them.
 *
 * Returns NASD_BAD_BLKCNT for an out-of-range in_lblkcnt,
 * NASD_BAD_OFFSET when the range does not fit in the object's
 * pointer levels, or whatever the mapping/allocation helpers
 * return. *blocks_to_allocp (if non-NULL) receives the zero-pointer
 * count from pass 1.
 */
nasd_status_t
nasd_od_bmap(
  nasd_odc_ent_t  *ne,               /* ent containing object node */
  nasd_oblkno_t    in_lblkno,        /* first logical blkno to map */
  nasd_oblkcnt_t   in_lblkcnt,       /* logical blocks to map */
  nasd_blkcnt_t    in_beforemax,     /* max blocks to cluster backwards */
  nasd_blkcnt_t    in_aftermax,      /* max blocks to cluster forwards */
  int              partnum,          /* partition number */
  int              flags,            /* operational flags */
  nasd_blkrec_t   *blkp,             /* pointer to physical block array */
  nasd_blkcnt_t   *blocks_beforep,   /* contig blocks before *blkp */
  nasd_blkcnt_t   *blocks_afterp,    /* contig blocks after *blkp */
  int             *blocks_to_allocp) /* number of blocks needed to allocate */
{
  int lvl, i, zb[NASD_OD_ILVLS], zeroblks, b, exln, flvl, needblks;
  nasd_blkno_t firstb, last_fb, hint_blk, ckblk;
  nasd_oblkno_t lblkno, flblkno, tlblkno;
  nasd_odc_exlist_ent_t *exle, *pre_exle;
  nasd_blkcnt_t pbs_got, alloc_got;
  nasd_od_direct_ptr_t *direct;
  nasd_offset_t last_fault;
  nasd_oblkcnt_t lblkcnt;
  nasd_odc_icpart_t *icp;
  nasd_status_t rc, rc2;
  nasd_od_part_t *part;
  nasd_offset_t offset;
  nasd_odc_exle_t exl;
  nasd_od_node_t *np;

  NASD_ODC_CHECK_NODE_ENT(ne);

  if (in_lblkcnt <= 0)
    return(NASD_BAD_BLKCNT);
  if (in_lblkcnt > NASD_ODC_MAX_BMAP)
    return(NASD_BAD_BLKCNT);

  bzero((char *)zb, sizeof(zb));
  if (blocks_beforep)
    *blocks_beforep = 0;
  if (blocks_afterp)
    *blocks_afterp = 0;
  lblkno = in_lblkno;
  lvl = (-1);
  flvl = (-1);
  zeroblks = 0;

  firstb = ne->blkno;

  np = ne->data.node;
  NASD_ASSERT(partnum == NASD_OD_PARTNUM(np));
  part = &PART(partnum);
  icp = &nasd_odc_state->parts[partnum];

  /*
   * Find level of first block.
   */
  for(i=0;i<NASD_OD_ILVLS;i++) {
    if (lblkno < nasd_od_ilvl_ptrs[i]) {
      flvl = lvl = i;
      break;
    }
    lblkno -= nasd_od_ilvl_ptrs[i];
  }
  if (lvl < 0) {
    /*
     * Exceeds maximum object capacity.
     */
    return(NASD_BAD_OFFSET);
  }
  flblkno = lblkno;

  b = 0; /* position in blkp array */
  tlblkno = lblkno; /* blocks mapped in last pass */
  lblkcnt = in_lblkcnt;

  /*
   * Pass 1: resolve what is already mapped, one pointer level per
   * iteration, counting zero pointers as we go.
   */
  do {
    if (lvl >= NASD_OD_ILVLS) {
      /*
       * The range starts inside the object but runs off the end of
       * the last pointer level. Nothing has been allocated yet, so
       * just refuse instead of indexing past the level tables.
       */
      return(NASD_BAD_OFFSET);
    }
    rc = nasd_od_ibmap(ne, ne, &np->ptrs[nasd_od_ilvl_ptr_psum[lvl]],
      nasd_od_ilvl_top_ptrs[lvl], tlblkno, lblkcnt, lvl,
      in_beforemax, in_aftermax, partnum, flags, &firstb, &b, blkp,
      blocks_beforep, blocks_afterp, &zeroblks, zb);
    if (rc) {
      return(rc);
    }
    tlblkno = 0;
    lvl++;
  } while ((b < in_lblkcnt) && (rc == NASD_SUCCESS));

  /* rewind for pass 2 */
  b = 0; /* position in blkp array */
  tlblkno = lblkno = flblkno;
  lvl = flvl;
  lblkcnt = in_lblkcnt;

  if (blocks_to_allocp)
    *blocks_to_allocp = zeroblks;

  rc = NASD_SUCCESS;
  if (zeroblks && (flags&NASD_ODC_B_ALLOC)) {
    needblks = zeroblks;

    /*
     * Get blocks first from preallocation, then
     * allocate any more blocks if that's not enough.
     *
     * zeroblks is how many blocks we need to fault
     * needblks is how many blocks we need to allocate
     * pbs_got is how many preallocated blocks we used
     */
    NASD_ODC_ICPART_LOCK_WRITE(icp);

    rc = nasd_od_layout_get_prealloc(partnum, ne, needblks, &pre_exle, &pbs_got);
    if (rc != NASD_SUCCESS) {
      NASD_PANIC();
    }
    needblks -= pbs_got;

    /*
     * We give a hint, which is the block we will be following. That
     * block is either the last prealloc block if we have prealloc
     * blocks, or the last mapped block we hit, or the node block.
     */
    if (pre_exle) {
      hint_blk = pre_exle->range.last;
    }
    else {
      if (zeroblks < in_lblkcnt) {
        hint_blk = firstb;
      }
      else {
        /*
         * We are faulting all our blocks- figure out where we would
         * like to go after.
         *
         * XXX modify this to check incore only indirect blocks
         * all the way down.
         */
        if (in_lblkno == 0) {
          hint_blk = ne->blkno;
        }
        else if (in_lblkno < nasd_od_ilvl_ptrs[0]) {
          /* nearest mapped direct block below the range, else the node */
          hint_blk = ne->blkno;
          direct = (nasd_od_direct_ptr_t *)np->ptrs;
          for(ckblk=in_lblkno-1;ckblk>0;ckblk--) {
            if (direct[ckblk].blkno) {
              hint_blk = direct[ckblk].blkno;
              break;
            }
          }
          if ((ckblk == 0) && (direct[0].blkno)) {
            hint_blk = direct[0].blkno;
          }
        }
        else {
          /* XXX should be checking indirect blocks here */
          hint_blk = firstb;
        }
      }
    }
    rc = nasd_od_layout_alloc_blocks(partnum, ne, needblks, hint_blk,
      &exle, &alloc_got);
    if (rc) {
      /*
       * NOTE(review): pre_exle is not handed back on this path;
       * confirm whether the layout module expects it to be returned.
       */
      NASD_ODC_ICPART_UNLOCK_WRITE(icp);
      return(rc);
    }
    rc = nasd_odc_ref_ranges(partnum, exle, 1, NULL, NASD_ODC_REF_NOFLAGS);
    if (rc != NASD_SUCCESS) {
      NASD_PANIC();
    }
    if (pre_exle) {
      /* chain: preallocated extent first, fresh allocation after it */
      pre_exle->next = exle;
      if (exle) {
        /* exle may be empty when preallocation covered everything */
        exle->prev = pre_exle;
      }
      exle = pre_exle;
    }
    nasd_part_modified(partnum);
    NASD_ODC_ICPART_UNLOCK_WRITE(icp);
    np->blocks_allocated += alloc_got + pbs_got;
    exl.unused = exle;
    exl.used = NULL;
    exl.usedc = 0;
    exl.total_usedc = 0;
    exln = 0;
    offset = nasd_od_ilvl_offset[lvl];
    offset += tlblkno * NASD_OD_BASIC_BLOCKSIZE;
    /*
     * Pass 2: walk the same range again, installing new blocks
     * wherever a zero pointer is found.
     */
    do {
      last_fb = 0;
      rc = nasd_od_fbmap(ne, ne, &np->ptrs[nasd_od_ilvl_ptr_psum[lvl]],
        nasd_od_ilvl_top_ptrs[lvl], tlblkno, lblkcnt, lvl,
        partnum, flags, &exl, &exln, &b, blkp, &offset, &last_fb);
      if (rc)
        break;
      lvl++;
      tlblkno = 0;
      if (lvl < NASD_OD_ILVLS) {
        /* only look up the next level's base offset if there is one */
        offset = nasd_od_ilvl_offset[lvl];
      }
    } while (( b < in_lblkcnt) && (rc == NASD_SUCCESS));
    if (rc == NASD_SUCCESS) {
      NASD_ASSERT(exl.total_usedc == zeroblks);
      NASD_ASSERT(exl.unused == NULL);
      exle = exl.used;
      nasd_odc_release_extent_list(exle);
      last_fault = (in_lblkno + in_lblkcnt) * NASD_OD_BASIC_BLOCKSIZE;
      if (last_fault >= np->object_len) {
        np->last_block = last_fb;
      }
    }
    else {
      /*
       * We encountered an error establishing the mappings.
       * Release the blocks.
       *
       * nasd_od_bm_nextf() relinks every fully-consumed extent from
       * exl.unused onto exl.used, so the original head (exle) no
       * longer reaches the whole allocation. Drop both chains. The
       * reference update is done under the partition write lock,
       * like the other nasd_odc_ref_ranges() calls in this file.
       *
       * NOTE(review): pointers already installed by nasd_od_fbmap()
       * before the failure are not cleared here -- confirm callers
       * cope with that.
       */
      np->blocks_allocated -= alloc_got + pbs_got;
      NASD_ODC_ICPART_LOCK_WRITE(icp);
      if (exl.used) {
        rc2 = nasd_odc_ref_ranges(partnum, exl.used, -1, NULL,
          NASD_ODC_REF_EJECT);
        if (rc2) {
          NASD_PANIC();
        }
      }
      if (exl.unused) {
        rc2 = nasd_odc_ref_ranges(partnum, exl.unused, -1, NULL,
          NASD_ODC_REF_EJECT);
        if (rc2) {
          NASD_PANIC();
        }
      }
      nasd_part_modified(partnum);
      NASD_ODC_ICPART_UNLOCK_WRITE(icp);
      if (exl.used) {
        nasd_odc_release_extent_list(exl.used);
      }
      if (exl.unused) {
        nasd_odc_release_extent_list(exl.unused);
      }
    }
  }

  return(rc);
}

/*
 * nasd_od_ibunmap
 *
 * Internal unmapping function. !!! RECURSIVE !!!
 * Descend portion of node block tree, dereferencing
 * blocks as we come back up. Return NASD_SUCCESS
 * if all blocks deref'd. Return NASD_BLOCK_PARTIAL
 * if end-of-range reached, but passed-in array not
 * entirely consumed (used for zeroing out part of
 * a range- tells caller not to release indirect block).
 * Return other codes for error.
 *
 * Every pointer cleared here is queued on exl and counted in
 * *marked. Whenever this call clears a pointer in the array held
 * by pe, pe is marked dirty before a normal return.
 *
 * The icp argument is ignored: the handle is re-derived from
 * partnum.
 */
nasd_status_t
nasd_od_ibunmap(
  nasd_odc_ent_t     *node_ent,     /* ent containing node */
  nasd_odc_icpart_t  *icp,          /* in-core partition handle */
  nasd_odc_ent_t     *pe,           /* parent entry (contains blocks array) */
  void               *blocks,       /* current block array */
  int                 nblocks,      /* size of this array */
  nasd_oblkno_t       in_lblkno,    /* first logical block to map,
                                     * where 0 == beginning leaf of
                                     * this array */
  nasd_oblkno_t       in_lblkcnt,   /* blocks to map */
  int                 level,        /* current indirection level */
  int                 partnum,      /* partition number */
  nasd_oblkcnt_t     *bp,           /* number of leaf blocks completed (in+out) */
  nasd_odc_exlist_t  *exl,          /* list of blocks to lose (in+out) */
  int                *marked)       /* number of blocks remarked (out) */
{
  nasd_od_indirect_ptr_t *indirect;
  nasd_od_direct_ptr_t *direct;
  nasd_oblkno_t n, ni, pn, fni;
  nasd_od_part_t *part;
  nasd_odc_ent_t *ent;
  nasd_blkno_t ind;
  nasd_oblkcnt_t b;
  nasd_status_t rc;
  int i, d, f;

  if (in_lblkcnt <= 0)
    return(NASD_BAD_BLKCNT);
  if (in_lblkcnt > (NASD_OD_MAX_OBJ_LEN/NASD_OD_BASIC_BLOCKSIZE))
    return(NASD_BAD_BLKCNT);

  NASD_ASSERT(level >= 0);
  NASD_ASSERT(level < NASD_OD_ILVLS);

  part = &PART(partnum);
  icp = &nasd_odc_state->parts[partnum];

  b = *bp;
  n = in_lblkno;
  d = 0; /* set once we clear a pointer in pe's array */
  if (level == 0) {
    direct = (nasd_od_direct_ptr_t *)blocks;
    /*
     * We're direct. Party on.
     */
    while((b<in_lblkcnt)&&(n<nblocks)) {
      if (direct[n].blkno) {
        *marked += 1;
        NASD_ODC_ICPART_LOCK_WRITE(icp);
        rc = nasd_odc_exlist_release_oneblock(exl, direct[n].blkno);
        NASD_ODC_ICPART_UNLOCK_WRITE(icp);
        if (rc) {
          *bp = b;
          return(rc);
        }
        direct[n].blkno = 0;
        d = 1;
      }
      n++;
      b++;
    }
    /*
     * Anything still mapped before or after the span we just
     * cleared means the holder of this array must be kept.
     */
    rc = NASD_SUCCESS;
    if (in_lblkno) {
      for(i=0;i<in_lblkno;i++) {
        if (direct[i].blkno)
          rc = NASD_BLOCK_PARTIAL;
      }
    }
    if (rc == NASD_SUCCESS) {
      for(i=n;i<nblocks;i++)
        if (direct[i].blkno)
          rc = NASD_BLOCK_PARTIAL;
    }
    if (d) {
      /*
       * We changed pointers stored in pe. Without this, a pointer
       * block that survives (NASD_BLOCK_PARTIAL) was never marked
       * dirty here. Matches what the indirect case below does for
       * its own array.
       */
      nasd_odc_dirty_ent(pe);
    }
    *bp = b;
    return(rc);
  }

  /*
   * We're at some level of indirection. Crank through our current
   * block, pulling in subhierarchies of blocks.
   */
  indirect = (nasd_od_indirect_ptr_t *)blocks;
  fni = ni = n / nasd_od_ilvl_leaves[level];
  NASD_ASSERT(ni >= 0);
  NASD_ASSERT(ni < nblocks);
  d = 0;
  f = 0; /* count of reasons this array is still in use */
  while((b<in_lblkcnt)&&(ni<nblocks)) {
    ind = indirect[ni].blkno;
    if (ind) {
      rc = nasd_odc_block_get(node_ent, ind,
        NASD_ODC_L_LOAD|NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK,
        &ent, node_ent->identifier, 0, NASD_ODC_T_IND, NULL);
      if (rc) {
        *bp = b;
        return(rc);
      }
      NASD_ODC_LOCK_BLOCK(ent);
      nasd_odc_wait_not_busy_invalid(ent);
      /*
       * NOTE(review): n advances by a whole subtree per iteration,
       * so pn keeps the initial offset for every later ni rather
       * than restarting at 0 -- TODO confirm this is intended.
       */
      pn = n - (ni * nasd_od_ilvl_leaves[level]);
      rc = nasd_od_ibunmap(node_ent, icp, ent, ent->data.blk,
        BLKPTRS(level-1), pn, in_lblkcnt, level-1,
        partnum, &b, exl, marked);
      NASD_ODC_UNLOCK_BLOCK(ent);
      nasd_odc_block_release(ent);
      if (rc == NASD_SUCCESS) {
        /* entirely consumed sub-block */
        *marked += 1;
        indirect[ni].blkno = 0;
        d = 1;
        NASD_ODC_ICPART_LOCK_WRITE(icp);
        rc = nasd_odc_exlist_release_oneblock(exl, ind);
        NASD_ODC_ICPART_UNLOCK_WRITE(icp);
        if (rc) {
          *bp = b;
          return(rc);
        }
      }
      if (rc == NASD_BLOCK_PARTIAL) {
        /* child still holds live pointers: keep it, and therefore us */
        f++;
        rc = NASD_SUCCESS;
      }
      if (rc) {
        *bp = b;
        return(rc);
      }
    }
    else {
      /*
       * Nothing below this pointer; count the whole subtree as done.
       * NOTE(review): this counts a full subtree even when the range
       * starts part-way into it -- TODO confirm.
       */
      b += nasd_od_ilvl_leaves[level];
    }
    n += nasd_od_ilvl_leaves[level];
    ni++;
    NASD_ASSERT(ni >= 0);
    NASD_ASSERT(ni <= nblocks); /* = case will break us out of loop */
  }
  /* look for surviving pointers outside the span we walked */
  if (f == 0) {
    for(i=0;i<fni;i++) {
      if (indirect[i].blkno)
        f++;
    }
  }
  if (f == 0) {
    for(i=ni;i<nblocks;i++) {
      if (indirect[i].blkno)
        f++;
    }
  }
  if (f == 0) {
    /* consumed the entire block */
    rc = NASD_SUCCESS;
  }
  else {
    rc = NASD_BLOCK_PARTIAL;
  }

  if (d) {
    nasd_odc_dirty_ent(pe);
  }

  *bp = b;
  return(rc);
}

/*
 * nasd_od_bunmap
 *
 * Caller does not hold partition lock.
 *
 * Primary remove-data routine. Iterate through blocks in specified
 * range, releasing references to them.
 *
 * The node entry is marked dirty up front and np->blocks_allocated
 * is reduced by the number of blocks unmapped; the caller is
 * responsible for updating the other node fields.
 *
 * Returns NASD_BAD_BLKCNT for an out-of-range in_lblkcnt,
 * NASD_BAD_OFFSET when in_lblkno lies beyond the pointer levels,
 * an error from nasd_odc_exlist_get(), otherwise the first error
 * hit while unmapping or dropping references (NASD_SUCCESS if
 * none). Blocks collected before a failure still have their
 * references dropped.
 */
nasd_status_t
nasd_od_bunmap(
  nasd_odc_ent_t  *ne,
  nasd_oblkno_t    in_lblkno,
  nasd_oblkcnt_t   in_lblkcnt,
  int              partnum)
{
  nasd_oblkno_t lblkno, tlblkno, flblkno;
  nasd_oblkcnt_t lblkcnt, b;
  nasd_odc_exlist_t *exlist;
  nasd_odc_icpart_t *icp;
  nasd_od_part_t *part;
  int lvl, i, marked;
  nasd_od_node_t *np;
  nasd_status_t rc, rc2;

  if (in_lblkcnt <= 0)
    return(NASD_BAD_BLKCNT);
  if (in_lblkcnt > (NASD_OD_MAX_OBJ_LEN/NASD_OD_BASIC_BLOCKSIZE))
    return(NASD_BAD_BLKCNT);

  lblkno = in_lblkno;
  lvl = (-1);

  np = ne->data.node;
  NASD_ASSERT(partnum == NASD_OD_PARTNUM(np));
  part = &PART(partnum);
  icp = &nasd_odc_state->parts[partnum];

  /*
   * Find level of first block.
   */
  for(i=0;i<NASD_OD_ILVLS;i++) {
    if (lblkno < nasd_od_ilvl_ptrs[i]) {
      lvl = i;
      break;
    }
    lblkno -= nasd_od_ilvl_ptrs[i];
  }
  if (lvl < 0) {
    /*
     * Exceeds maximum object capacity.
     */
    return(NASD_BAD_OFFSET);
  }
  flblkno = lblkno;

  rc = nasd_odc_exlist_get(&exlist);
  if (rc)
    return(rc);

  b = 0;
  marked = 0;
  tlblkno = lblkno;
  lblkcnt = in_lblkcnt;

  nasd_odc_dirty_ent(ne);
  /* one pointer level per iteration, starting at the range's level */
  do {
    NASD_ASSERT(lvl<NASD_OD_ILVLS);
    rc = nasd_od_ibunmap(ne, icp, ne, &np->ptrs[nasd_od_ilvl_ptr_psum[lvl]],
      nasd_od_ilvl_top_ptrs[lvl], tlblkno, lblkcnt, lvl, partnum,
      &b, exlist, &marked);
    if (rc == NASD_BLOCK_PARTIAL) {
      /*
       * We don't care- our caller is responsible for dealing with
       * node, we just mark it dirty (above) and live on.
       */
      rc = NASD_SUCCESS;
    }
    tlblkno = 0;
    lvl++;
  } while ((b < in_lblkcnt) && (rc == NASD_SUCCESS));

  np->blocks_allocated -= marked;

  /* caller responsible for updating other fields */

  /*
   * Release references on blocks.
   */
  if (exlist->head.next != &exlist->head) {
    /* cut the entries loose from the list head: NULL-terminated chain */
    exlist->head.next->prev = NULL;
    exlist->head.prev->next = NULL;
    NASD_ODC_ICPART_LOCK_WRITE(icp);
    rc2 = nasd_odc_ref_ranges(partnum, exlist->head.next, -1, NULL,
      NASD_ODC_REF_EJECT);
    nasd_part_modified(partnum);
    NASD_ODC_ICPART_UNLOCK_WRITE(icp);
    if (rc == NASD_SUCCESS) {
      /*
       * Only report the dereference status when the unmap walk
       * itself succeeded; otherwise keep the earlier error instead
       * of overwriting it.
       */
      rc = rc2;
    }

    exlist->head.prev->next = NULL;
    exlist->head.next->prev = NULL;
    nasd_odc_release_extent_list(exlist->head.next);
  }

  nasd_odc_exlist_free(exlist);

  return(rc);
}

/*
 * Give back phys block of highest logical block below.
 *
 * Scans the pointer array in blocks downward from index high_ind.
 * At level 0 the first nonzero direct pointer found is stored in
 * *out_blk. At higher levels the first nonzero indirect pointer is
 * stored in *out_blk, then its pointer block is loaded and searched
 * from its last slot; a block found deeper replaces the value.
 *
 * *out_blk is left untouched when every slot examined is zero.
 * Returns NASD_SUCCESS, or the error from nasd_odc_block_get().
 * partnum and nblocks are not used.
 */
nasd_status_t
nasd_od_ibfind_last_block(
  nasd_odc_ent_t  *ne,
  int              partnum,
  void            *blocks,
  int              nblocks,
  int              high_ind,
  int              level,
  nasd_blkno_t    *out_blk)
{
  nasd_od_indirect_ptr_t *indirect;
  nasd_od_direct_ptr_t *direct;
  nasd_odc_ent_t *ent;
  nasd_status_t rc;
  int i;

  if (level == 0) {
    direct = (nasd_od_direct_ptr_t *)blocks;
    for(i=high_ind;i>=0;i--) {
      if (BLOCKVAL(direct,i)) {
        *out_blk = BLOCKVAL(direct,i);
        return(NASD_SUCCESS);
      }
    }
    return(NASD_SUCCESS);
  }

  indirect = (nasd_od_indirect_ptr_t *)blocks;
  /*
   * Include slot 0 (the loop used to stop at 1), and stop at the
   * first hit: continuing to lower slots would overwrite *out_blk
   * with a block that sits logically earlier in the object.
   */
  for(i=high_ind;i>=0;i--) {
    if (BLOCKVAL(indirect,i)) {
      /* fallback answer in case nothing is mapped underneath */
      *out_blk = BLOCKVAL(indirect,i);
      rc = nasd_odc_block_get(ne, BLOCKVAL(indirect,i),
        NASD_ODC_L_LOAD|NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK,
        &ent, ne->identifier, 0, NASD_ODC_T_IND, NULL);
      if (rc) {
        return(rc);
      }
      NASD_ODC_LOCK_BLOCK(ent);
      nasd_odc_wait_not_busy_invalid(ent);
      rc = nasd_od_ibfind_last_block(ne, partnum, ent->data.blk,
        BLKPTRS(level-1), BLKPTRS(level-1)-1, level-1, out_blk);
      NASD_ODC_UNLOCK_BLOCK(ent);
      nasd_odc_block_release(ent);
      return(rc);
    }
  }

  return(NASD_SUCCESS);
}

/*
 * nasd_od_bfind_last_block
 *
 * Recompute np->last_block for the object in ne, given its length
 * in bytes. A zero-length object gets the node's own block number.
 * Otherwise the node's top-level pointer arrays are scanned
 * backwards, starting at the slot covering the end of the object,
 * for the highest nonzero pointer; if that pointer is indirect,
 * nasd_od_ibfind_last_block() follows it down to the last mapped
 * block beneath it.
 *
 * Returns NASD_SUCCESS, or the error from the descent.
 */
nasd_status_t
nasd_od_bfind_last_block(
  nasd_odc_ent_t  *ne,
  int              partnum,
  nasd_uint64      object_len)
{
  int lvl, i, j, lp, last_block_lvl, high_ind;
  nasd_blkno_t lblkno, last_block, out_blk;
  nasd_od_indirect_ptr_t *indirect;
  nasd_od_direct_ptr_t *direct;
  nasd_odc_icpart_t *icp;
  nasd_od_part_t *part;
  nasd_od_node_t *np;
  nasd_status_t rc;

  np = ne->data.node;
  part = &PART(partnum);
  icp = &nasd_odc_state->parts[partnum];

  last_block = ne->blkno;
  last_block_lvl = (-1);
  lvl = (-1);
  high_ind = (-1);

  if (object_len == 0) {
    np->last_block = ne->blkno;
    return(NASD_SUCCESS);
  }

  /*
   * Scan object backwards, looking for nonzero block pointer.
   */
  lblkno = (object_len + NASD_OD_BASIC_BLOCKSIZE - 1)
    / NASD_OD_BASIC_BLOCKSIZE;
  for(i=0;i<NASD_OD_ILVLS;i++) {
    if (lblkno < nasd_od_ilvl_ptrs[i]) {
      lvl = i;
      break;
    }
    lblkno -= nasd_od_ilvl_ptrs[i];
  }
  if (lvl < 0) {
    /* length reaches past every level: start at the very last pointer */
    lvl = NASD_OD_ILVLS-1;
    lp = nasd_od_ilvl_top_ptrs[lvl]-1;
  }
  else {
    lp = (lblkno / nasd_od_ilvl_leaves[lvl]);
  }
  /*
   * lp is the offset in the pointerset at lvl
   * to start looking at
   */
  for(i=lvl;i;i--) {
    indirect = (nasd_od_indirect_ptr_t *)&np->ptrs[nasd_od_ilvl_ptr_psum[i]];
    for(j=lp;j>=0;j--) {
      if (BLOCKVAL(indirect,j)) {
        last_block = BLOCKVAL(indirect,j);
        last_block_lvl = i;
        high_ind = j;
        break;
      }
    }
    /*
     * Nothing at this level: the next scan starts at the last slot
     * of the node's top-level array one level down. That array has
     * nasd_od_ilvl_top_ptrs[] entries (the same bound used for it
     * elsewhere in this file), not a full block's worth of
     * pointers, which is what BLKPTRS() would give.
     */
    lp = nasd_od_ilvl_top_ptrs[i-1] - 1;
    if (last_block_lvl >= 0)
      break;
  }
  if (last_block_lvl < 0) {
    direct = (nasd_od_direct_ptr_t *)np->ptrs;
    for(j=lp;j>=0;j--) {
      if (BLOCKVAL(direct,j)) {
        last_block = BLOCKVAL(direct,j);
        last_block_lvl = 0;
        high_ind = j;
        break;
      }
    }
  }
  if (last_block_lvl > 0) {
    /*
     * Now must chase down tree to find the last leaf
     */
    out_blk = 0;
    rc = nasd_od_ibfind_last_block(ne, partnum,
      &np->ptrs[nasd_od_ilvl_ptr_psum[last_block_lvl]],
      nasd_od_ilvl_top_ptrs[last_block_lvl],
      high_ind, last_block_lvl, &out_blk);
    if (rc) {
      goto done;
    }
    if (out_blk) {
      last_block = out_blk;
    }
  }

  /*
   * NOTE(review): this fires for an object with nonzero length but
   * no mapped blocks at all -- confirm callers never pass one.
   */
  NASD_ASSERT(last_block_lvl >= 0);

  np->last_block = last_block;
  rc = NASD_SUCCESS;

done:
  return(rc);
}

/*
 *
 * In normal case, this is a noop. 
 *
 * When using the NASD_OD_EXT_PTR, this releases the references
 * we are holding on the direct pointer block
 *
 * Only the first time we touch a odc_entry per blkp array is it recorded.
 * Therefore, you should undo the entire blkp array at once 
 *
 * If compiled with security, we promise to turn off all the off_flags THEN
 * turn on the on_flags and notify the appropriate threads waiting.
 * The flag update (lock, clear, set, unlock, broadcast) is done only
 * when at least one of off_flag / on_flag is nonzero.
 *
 * Always returns NASD_SUCCESS.
 */
nasd_status_t
nasd_od_bmap_release(
  nasd_blkrec_t     *blkp,
  nasd_blkcnt_t      blkcount,
  nasd_odc_flags_t   off_flag,
  nasd_odc_flags_t   on_flag)
{
#if NASD_OD_EXT_PTR > 0 
  int i;

  for(i=0; i < blkcount; i++) {
    if (blkp[i].flags & NASD_HAS_BLOCK_REF) {

#if NASD_SECURE_RPCS_ENABLE > 0
      /*
       * The whole update is braced together. Previously only the
       * LOCK was conditional, so with both flags zero the entry was
       * unlocked without ever having been locked.
       */
      if (off_flag != 0  ||  on_flag != 0 ) {
        NASD_ODC_LOCK_BLOCK(blkp[i].odc_entp);
        blkp[i].odc_entp->data_flags &= ~(off_flag);
        blkp[i].odc_entp->data_flags |= on_flag;
        NASD_ODC_UNLOCK_BLOCK(blkp[i].odc_entp);
        NASD_BROADCAST_COND(blkp[i].odc_entp->cond);  
      }
#endif /* NASD_SECURE_RPCS_ENABLE > 0 */

      nasd_odc_block_release(blkp[i].odc_entp);
    }
  }
#endif /* NASD_OD_EXT_PTR > 0 */

  return(NASD_SUCCESS);
}


/* Local Variables:  */
/* indent-tabs-mode: nil */
/* tab-width: 2 */
/* End: */

