Windows-Server-2003/base/fs/ntfs/deviosup.c

/*++

Copyright (c) 1991  Microsoft Corporation

Module Name:

    DevIoSup.c

Abstract:

    This module implements the low level disk read/write support for Ntfs

Author:

    Brian Andrew    BrianAn
    Tom Miller      TomM

Revision History:

--*/

#include "NtfsProc.h"
#include <ntddft.h>
#include <ntddvol.h>

#ifdef NTFS_RWC_DEBUG
extern BOOLEAN NtfsBreakOnConflict;
#endif

//
//  Number of pages to allocate a mdl on the stack for
//

#define NTFS_MDL_TRANSFER_PAGES 0x10

//
//  The Bug check file id for this module
//

#define BugCheckFileId                   (NTFS_BUG_CHECK_DEVIOSUP)

//
//  Local debug trace level
//

#define Dbg                              (DEBUG_TRACE_DEVIOSUP)

//
//  Define a tag for general pool allocations from this module
//

#undef MODULE_POOL_TAG
#define MODULE_POOL_TAG                  ('DFtN')

//
//  We need a special test for success, whenever we are seeing if we should
//  hot fix, because the FT driver returns one of two success codes if a read or
//  write failed to only one of the members.
//

#define FT_SUCCESS(STS) (NT_SUCCESS(STS) &&                                 \
                         ((STS) != STATUS_FT_READ_RECOVERY_FROM_BACKUP) &&  \
                         ((STS) != STATUS_FT_WRITE_RECOVERY))


//
//  Boolean to control whether we output HotFix information to the debugger.
//

#if DBG
BOOLEAN NtfsHotFixTrace = FALSE;
#define HotFixTrace(X) {if (NtfsHotFixTrace) KdPrint(X);}
#else
#define HotFixTrace(X) {NOTHING;}
#endif

//
//  Boolean to indicate whether to break on a decompress error
//

#if (defined BRIANDBG || defined SYSCACHE_DEBUG)
BOOLEAN NtfsStopOnDecompressError = TRUE;
#else
BOOLEAN NtfsStopOnDecompressError = FALSE;
#endif

//
//  Macro to collect the Disk IO stats.
//
//  Adds COUNT to one of the counters in the Common part of the VCB's
//  statistics entry for the current processor:
//
//      - If NtfsIsTypeCodeUserData accepts the SCB's attribute type code and
//        the SCB's Fcb is not flagged FCB_STATE_SYSTEM_FILE, the transfer is
//        counted in UserDiskWrites / UserDiskReads.
//
//      - Otherwise, unless SCB is the VCB's LogFileScb, it is counted in
//        MetaDataDiskWrites / MetaDataDiskReads.
//
//      - Transfers on the LogFileScb are not counted by this macro.
//
//  FUNCTION equal to IRP_MJ_WRITE selects the write counter; any other value
//  selects the read counter.
//
//  The macro expands to a brace block and evaluates VCB, SCB, FUNCTION and
//  COUNT more than once, so the arguments must be free of side effects.
//

#define CollectDiskIoStats(VCB,SCB,FUNCTION,COUNT) {                                           \
    PFILESYSTEM_STATISTICS FsStats = &(VCB)->Statistics[KeGetCurrentProcessorNumber()].Common; \
    ASSERT((SCB)->Fcb != NULL);                                                                \
    if (NtfsIsTypeCodeUserData( (SCB)->AttributeTypeCode ) &&                                  \
        !FlagOn( (SCB)->Fcb->FcbState, FCB_STATE_SYSTEM_FILE )) {                              \
        if ((FUNCTION) == IRP_MJ_WRITE) {                                                      \
            FsStats->UserDiskWrites += (COUNT);                                                \
        } else {                                                                               \
            FsStats->UserDiskReads += (COUNT);                                                 \
        }                                                                                      \
    } else if ((SCB) != (VCB)->LogFileScb) {                                                   \
        if ((FUNCTION) == IRP_MJ_WRITE) {                                                      \
            FsStats->MetaDataDiskWrites += (COUNT);                                            \
        } else {                                                                               \
            FsStats->MetaDataDiskReads += (COUNT);                                             \
        }                                                                                      \
    }                                                                                          \
}

//
//  Define a context structure for holding the compression state for the
//  buffers of a transfer.  A pointer to it is passed to
//  NtfsAllocateCompressionBuffer, NtfsDeallocateCompressionBuffer,
//  NtfsPrepareBuffers, NtfsFinishBuffers and NtfsEncryptBuffers.
//

typedef struct COMPRESSION_CONTEXT {

    //
    //  Pointer to allocated compression buffer, and its length
    //

    PUCHAR CompressionBuffer;
    ULONG CompressionBufferLength;

    //
    //  Saved fields from originating Irp
    //  (presumably Irp->MdlAddress and Irp->UserBuffer, kept so they can be
    //  put back later - confirm against the allocate/deallocate routines)
    //

    PMDL SavedMdl;
    PVOID SavedUserBuffer;

    //
    //  System Buffer pointer and offset in the System (user's) buffer
    //

    PVOID SystemBuffer;
    ULONG SystemBufferOffset;

    //
    //  IoRuns array in use.  This array may be extended one time
    //  in NtfsPrepareBuffers.  AllocatedRuns is the number of entries
    //  the array currently has room for.
    //

    PIO_RUN IoRuns;
    ULONG AllocatedRuns;

    //
    //  Workspace pointer, so that cleanup can occur in the caller.
    //

    PVOID WorkSpace;

    //
    //  Write acquires the Scb.  ScbAcquired records that it did.
    //
    //  NOTE(review): FinishBuffersNeeded has no description in this file
    //  section; presumably it tells the caller that NtfsFinishBuffers must
    //  be run for this transfer - confirm against NtfsPrepareBuffers.
    //

    BOOLEAN ScbAcquired;
    BOOLEAN FinishBuffersNeeded;

    //
    //  If this field is TRUE, it means the data has been copied from the
    //  system buffer to the compression buffer, and further operations,
    //  like compression, should look to the compression buffer for their
    //  source data.
    //

    BOOLEAN DataTransformed;

} COMPRESSION_CONTEXT, *PCOMPRESSION_CONTEXT;

//
//  Local support routines
//

VOID
NtfsAllocateCompressionBuffer (
    IN PIRP_CONTEXT IrpContext,
    IN PSCB ThisScb,
    IN PIRP Irp,
    IN PCOMPRESSION_CONTEXT CompressionContext,
    IN OUT PULONG CompressionBufferLength
    );

VOID
NtfsDeallocateCompressionBuffer (
    IN PIRP Irp,
    IN PCOMPRESSION_CONTEXT CompressionContext,
    IN BOOLEAN Reinitialize
    );

LONG
NtfsCompressionFilter (
    IN PIRP_CONTEXT IrpContext,
    IN PEXCEPTION_POINTERS ExceptionPointer
    );

ULONG
NtfsPrepareBuffers (
    IN PIRP_CONTEXT IrpContext,
    IN PIRP Irp,
    IN PSCB Scb,
    IN PVBO StartingVbo,
    IN ULONG ByteCount,
    IN ULONG StreamFlags,
    IN OUT PBOOLEAN Wait,
    OUT PULONG NumberRuns,
    OUT PCOMPRESSION_CONTEXT CompressionContext
    );

NTSTATUS
NtfsFinishBuffers (
    IN PIRP_CONTEXT IrpContext,
    IN PIRP Irp,
    IN PSCB Scb,
    IN PVBO StartingVbo,
    IN ULONG ByteCount,
    IN ULONG NumberRuns,
    IN PCOMPRESSION_CONTEXT CompressionContext,
    IN ULONG StreamFlags
    );

VOID
NtfsMultipleAsync (
    IN PIRP_CONTEXT IrpContext,
    IN PDEVICE_OBJECT DeviceObject,
    IN PIRP MasterIrp,
    IN ULONG MultipleIrpCount,
    IN PIO_RUN IoRuns,
    IN UCHAR IrpSpFlags
    );

VOID
NtfsSingleAsync (
    IN PIRP_CONTEXT IrpContext,
    IN PDEVICE_OBJECT DeviceObject,
    IN LBO StartingLbo,
    IN ULONG ByteCount,
    IN PIRP Irp,
    IN UCHAR MajorFunction,
    IN UCHAR IrpSpFlags
    );

VOID
NtfsWaitSync (
    IN PIRP_CONTEXT IrpContext
    );

NTSTATUS
NtfsMultiAsyncCompletionRoutine (
    IN PDEVICE_OBJECT DeviceObject,
    IN PIRP Irp,
    IN PVOID Contxt
    );

NTSTATUS
NtfsMultiSyncCompletionRoutine (
    IN PDEVICE_OBJECT DeviceObject,
    IN PIRP Irp,
    IN PVOID Contxt
    );

NTSTATUS
NtfsSingleAsyncCompletionRoutine (
    IN PDEVICE_OBJECT DeviceObject,
    IN PIRP Irp,
    IN PVOID Contxt
    );

NTSTATUS
NtfsSingleSyncCompletionRoutine (
    IN PDEVICE_OBJECT DeviceObject,
    IN PIRP Irp,
    IN PVOID Contxt
    );

NTSTATUS
NtfsPagingFileCompletionRoutine (
    IN PDEVICE_OBJECT DeviceObject,
    IN PIRP Irp,
    IN PVOID MasterIrp
    );

NTSTATUS
NtfsPagingFileNoAllocCompletionRoutine (
    IN PDEVICE_OBJECT DeviceObject,
    IN PIRP Irp,
    IN PVOID Context
    );

VOID
NtfsSingleNonAlignedSync (
    IN PIRP_CONTEXT IrpContext,
    IN PVCB Vcb,
    IN PSCB Scb,
    IN PUCHAR Buffer,
    IN VBO Vbo,
    IN LBO Lbo,
    IN ULONG ByteCount,
    IN PIRP Irp
    );

NTSTATUS
NtfsEncryptBuffers (
    IN PIRP_CONTEXT IrpContext,
    IN PIRP Irp,
    IN PSCB Scb,
    IN VBO StartingVbo,
    IN ULONG NumberRuns,
    IN PCOMPRESSION_CONTEXT CompressionContext
    );

VOID
NtfsFixDataError (
    IN PIRP_CONTEXT IrpContext,
    IN PSCB Scb,
    IN PDEVICE_OBJECT DeviceObject,
    IN PIRP MasterIrp,
    IN ULONG MultipleIrpCount,
    IN PIO_RUN IoRuns,
    IN UCHAR IrpSpFlags
    );

VOID
NtfsPostHotFix(
    IN PIRP Irp,
    IN PLONGLONG BadVbo,
    IN LONGLONG BadLbo,
    IN ULONG ByteLength,
    IN BOOLEAN DelayIrpCompletion
    );

VOID
NtfsPerformHotFix (
    IN PIRP_CONTEXT IrpContext
    );

BOOLEAN
NtfsGetReservedBuffer (
    IN PFCB ThisFcb,
    OUT PVOID *Buffer,
    OUT PULONG Length,
    IN UCHAR Need2
    );

BOOLEAN
NtfsFreeReservedBuffer (
    IN PVOID Buffer
    );

LONG
NtfsDefragExceptionFilter (
    IN PIRP_CONTEXT IrpContext OPTIONAL,
    IN PEXCEPTION_POINTERS ExceptionPointer,
    IN OUT PULONG DeletePendingFailureCountsLeft
    );

#ifdef ALLOC_PRAGMA

#pragma alloc_text(PAGE, NtfsReadFromPlex)
#pragma alloc_text(PAGE, NtfsDefragFile)

#endif


INLINE
BOOLEAN
NtfsZeroEndOfBuffer (
    IN PIRP Irp,
    IN PNTFS_IO_CONTEXT Context
    )

/*++

Routine Description:

    Clears the slack at the tail of an asynchronous read.  The disk transfer
    is done in whole sectors, so when fewer bytes were requested than the
    read length in the Irp, the bytes between the requested count and the
    next sector boundary hold garbage and are zeroed here.

    Only that tail is mapped: a partial Mdl is built on the stack over the
    Irp's Mdl and mapped at normal priority.  This is an async path, so a
    mapping failure is tolerated and reported through the Irp's IoStatus.
    Only the synchronous paging paths have a guarantee of forward progress.

Arguments:

    Irp - Irp whose buffer (described by Irp->MdlAddress) is to be zeroed.
        Its current stack location supplies the major function, the read
        length and the volume device object that contains the Vcb.

    Context - Io context which has the original operation bounds
        (Wait.Async.RequestedByteCount).

Return Value:

    TRUE if there was nothing to zero or the tail was zeroed.

    FALSE if the tail could not be mapped.  Irp->IoStatus.Status has then
    been set to STATUS_INSUFFICIENT_RESOURCES.

--*/

{
    UCHAR MdlStorage[sizeof( MDL ) + sizeof( PFN_NUMBER ) * (NTFS_MDL_TRANSFER_PAGES + 1)];
    PMDL TailMdl = (PMDL) MdlStorage;
    PIO_STACK_LOCATION IrpSp = IoGetCurrentIrpStackLocation( Irp );
    PVCB Vcb;
    PVOID TailAddress;
    ULONG SectorAlignedBytes;
    ULONG TailBytes;

    //
    //  Only reads which were trimmed below the Irp's length need any work.
    //

    if (IrpSp->MajorFunction != IRP_MJ_READ) {

        return TRUE;
    }

    if (Context->Wait.Async.RequestedByteCount >= IrpSp->Parameters.Read.Length) {

        return TRUE;
    }

    Vcb = &((PVOLUME_DEVICE_OBJECT) IrpSp->DeviceObject)->Vcb;

    ASSERT( Vcb->NodeTypeCode == NTFS_NTC_VCB );

    //
    //  If the requested count already ends on a sector boundary there is
    //  no slack to clear.
    //

    SectorAlignedBytes = BlockAlign( Context->Wait.Async.RequestedByteCount, (LONG)Vcb->BytesPerSector );

    if (SectorAlignedBytes <= Context->Wait.Async.RequestedByteCount) {

        return TRUE;
    }

    TailBytes = SectorAlignedBytes - Context->Wait.Async.RequestedByteCount;

    //
    //  Describe just the tail with the stack Mdl.  The starting address is
    //  the Irp Mdl's virtual address advanced by the requested byte count.
    //

    MmInitializeMdl( TailMdl, NULL, NTFS_MDL_TRANSFER_PAGES * PAGE_SIZE );
    IoBuildPartialMdl( Irp->MdlAddress,
                       TailMdl,
                       Add2Ptr( MmGetMdlBaseVa( Irp->MdlAddress ),
                                MmGetMdlByteOffset( Irp->MdlAddress ) + Context->Wait.Async.RequestedByteCount ),
                       TailBytes );

    //
    //  Map the tail.  We're an async path so we can return out of resources.
    //

    TailAddress = MmGetSystemAddressForMdlSafe( TailMdl, NormalPagePriority );

    if (TailAddress == NULL) {

        Irp->IoStatus.Status = STATUS_INSUFFICIENT_RESOURCES;
        return FALSE;
    }

    //
    //  Zero from the requested size up to the sector boundary and then
    //  release the mapping so the stack Mdl can go away.
    //

    RtlZeroMemory( TailAddress, TailBytes );
    MmPrepareMdlForReuse( TailMdl );

    return TRUE;
}


VOID
NtfsLockUserBuffer (
    IN PIRP_CONTEXT IrpContext,
    IN OUT PIRP Irp,
    IN LOCK_OPERATION Operation,
    IN ULONG BufferLength
    )

/*++

Routine Description:

    Locks the user buffer of an Irp for the given type of access.  Ntfs
    needs this because it does not ask the I/O system to lock its buffers
    for direct I/O.  It may only be called from the Fsd while still in the
    user context.

    If the Irp already has an Mdl the routine does nothing.  Otherwise an
    Mdl describing Irp->UserBuffer is allocated (the Irp is handed to
    IoAllocateMdl) and its pages are probed and locked.

Arguments:

    IrpContext - Context of the request; used when raising a status.

    Irp - Pointer to the Irp for which the buffer is to be locked.

    Operation - IoWriteAccess for read operations, or IoReadAccess for
                write operations.

    BufferLength - Length of user buffer.

Return Value:

    None.  The routine raises STATUS_INSUFFICIENT_RESOURCES if the Mdl
    cannot be allocated.  If probing fails, the Mdl is freed,
    Irp->MdlAddress is reset to NULL, and the exception code is raised if
    FsRtlIsNtstatusExpected accepts it, STATUS_INVALID_USER_BUFFER if not.

--*/

{
    PMDL NewMdl;
    NTSTATUS ProbeStatus;

    ASSERT_IRP_CONTEXT( IrpContext );
    ASSERT_IRP( Irp );

    //
    //  An Irp which already carries an Mdl needs nothing from us.
    //

    if (Irp->MdlAddress != NULL) {

        return;
    }

    //
    //  Build the Mdl for the user buffer, raising if there is no memory.
    //

    NewMdl = IoAllocateMdl( Irp->UserBuffer, BufferLength, FALSE, FALSE, Irp );

    if (NewMdl == NULL) {

        NtfsRaiseStatus( IrpContext, STATUS_INSUFFICIENT_RESOURCES, NULL, NULL );
    }

    //
    //  Probe and lock the pages.  Any exception here is turned into an
    //  "expected" status after the Mdl has been torn down again.
    //

    try {

        MmProbeAndLockPages( NewMdl, Irp->RequestorMode, Operation );

    } except(EXCEPTION_EXECUTE_HANDLER) {

        ProbeStatus = GetExceptionCode();

        IoFreeMdl( NewMdl );
        Irp->MdlAddress = NULL;

        if (!FsRtlIsNtstatusExpected( ProbeStatus )) {

            ProbeStatus = STATUS_INVALID_USER_BUFFER;
        }

        NtfsRaiseStatus( IrpContext, ProbeStatus, NULL, NULL );
    }
}


PVOID
NtfsMapUserBuffer (
    IN OUT PIRP Irp,
    IN MM_PAGE_PRIORITY Priority
    )

/*++

Routine Description:

    Returns an address through which the buffer of the current I/O request
    can be accessed.  With no Mdl on the Irp this is simply Irp->UserBuffer;
    otherwise it is the system address of the Mdl.  This variant raises if
    the Mdl cannot be mapped.

Arguments:

    Irp - Pointer to the Irp for the request.

    Priority - Mapping priority requested by the caller; should be
        NormalPagePriority unless it is a metadata page, in which case it
        can be high priority.  It is overridden with HighPagePriority
        whenever the Irp has IRP_PAGING_IO set.

Return Value:

    Mapped address.  Raises STATUS_INSUFFICIENT_RESOURCES if the system
    address for the Mdl is NULL.

--*/

{
    PVOID MappedAddress;
    MM_PAGE_PRIORITY EffectivePriority;

    //
    //  Without an Mdl we must be in the Fsd, and the UserBuffer field of
    //  the Irp can be used directly.
    //

    if (Irp->MdlAddress == NULL) {

        return Irp->UserBuffer;
    }

    //
    //  All paging i/o is mapped at high priority, whatever was asked for.
    //

    EffectivePriority = FlagOn( Irp->Flags, IRP_PAGING_IO ) ? HighPagePriority : Priority;

    //
    //  MM can return NULL if there are no system ptes.
    //

    MappedAddress = MmGetSystemAddressForMdlSafe( Irp->MdlAddress, EffectivePriority );

    if (MappedAddress == NULL) {

        ExRaiseStatus( STATUS_INSUFFICIENT_RESOURCES );
    }

    return MappedAddress;
}


PVOID
NtfsMapUserBufferNoRaise (
    IN OUT PIRP Irp,
    IN MM_PAGE_PRIORITY Priority
    )

/*++

Routine Description:

    Returns an address through which the buffer of the current I/O request
    can be accessed.  With no Mdl on the Irp this is simply Irp->UserBuffer;
    otherwise it is the system address of the Mdl.  This variant never
    raises; a failed mapping is reported by returning NULL.

Arguments:

    Irp - Pointer to the Irp for the request.

    Priority - Mapping priority requested by the caller; should be
        NormalPagePriority unless it is a metadata page, in which case it
        can be high priority.  It is overridden with HighPagePriority
        whenever the Irp has IRP_PAGING_IO set.

Return Value:

    Mapped address, or NULL if the Mdl could not be mapped.

--*/

{
    PVOID MappedAddress;
    MM_PAGE_PRIORITY EffectivePriority;

    //
    //  Without an Mdl we must be in the Fsd, and the UserBuffer field of
    //  the Irp can be used directly.
    //

    if (Irp->MdlAddress == NULL) {

        return Irp->UserBuffer;
    }

    //
    //  All paging i/o is mapped at high priority, whatever was asked for.
    //

    EffectivePriority = FlagOn( Irp->Flags, IRP_PAGING_IO ) ? HighPagePriority : Priority;

    //
    //  MM can return NULL if there are no system ptes; hand that straight
    //  back to the caller.
    //

    MappedAddress = MmGetSystemAddressForMdlSafe( Irp->MdlAddress, EffectivePriority );

    return MappedAddress;
}


VOID
NtfsFillIrpBuffer (
    IN PIRP_CONTEXT IrpContext,
    IN PIRP Irp,
    IN ULONG ByteCount,
    IN ULONG Offset,
    IN UCHAR Pattern
    )
/*++

Routine Description:

    Fill a range in the buffer contained within an irp with a given pattern.

    The buffer is normally mapped in one piece.  If that mapping cannot be
    obtained, the range is filled at most one page at a time through the
    Vcb's reserved mapping, serialized by the Vcb's ReservedMappingMutex.

Arguments:

    IrpContext - Context of the request; supplies the Vcb and is used when
        raising a status.

    Irp - Supplies the Irp being processed

    ByteCount - Number of bytes to fill

    Offset - Offset within the irp's buffer at which to begin filling

    Pattern - Pattern to fill the buffer with

Return Value:

    None.  STATUS_INVALID_USER_BUFFER is raised if an exception occurs
    while the buffer is being written.

--*/
{
    UCHAR MdlStorage[sizeof( MDL ) + sizeof( PFN_NUMBER ) * 2];
    PMDL ChunkMdl = (PMDL) MdlStorage;
    PVCB Vcb = IrpContext->Vcb;
    PVOID Mapping;
    ULONG Remaining;
    ULONG ChunkSize;

    //
    //  First attempt to directly map the user's buffer
    //

    Mapping = NtfsMapUserBufferNoRaise( Irp, NormalPagePriority );

    if (Mapping != NULL) {

        //
        //  The whole buffer is addressable, fill the range in one go.
        //

        try {
            RtlFillMemory( Add2Ptr( Mapping, Offset ), ByteCount, Pattern );
        } except( EXCEPTION_EXECUTE_HANDLER ) {
            NtfsRaiseStatus( IrpContext, STATUS_INVALID_USER_BUFFER, NULL, NULL );
        }

        return;
    }

    //
    //  If there weren't pte in the system cache we'll use the reserved
    //  mapping instead.  A NULL result above implies the Irp has an Mdl.
    //

    ASSERT( Irp->MdlAddress != NULL );

    MmInitializeMdl( ChunkMdl, NULL, 2 * PAGE_SIZE );
    ExAcquireFastMutexUnsafe( &Vcb->ReservedMappingMutex );

    for (Remaining = ByteCount; Remaining > 0; Remaining -= ChunkSize) {

        //
        //  Each pass covers the next piece of at most PAGE_SIZE bytes,
        //  starting (ByteCount - Remaining) bytes into the range.
        //

        ChunkSize = min( PAGE_SIZE, Remaining );

        IoBuildPartialMdl( Irp->MdlAddress,
                           ChunkMdl,
                           Add2Ptr( MmGetMdlBaseVa( Irp->MdlAddress ),
                                    MmGetMdlByteOffset( Irp->MdlAddress ) + Offset + ByteCount - Remaining ),
                           ChunkSize );

        Mapping = MmMapLockedPagesWithReservedMapping( Vcb->ReservedMapping,
                                                       RESERVE_POOL_TAG,
                                                       ChunkMdl,
                                                       MmCached );

        ASSERT( Mapping != NULL );

        try {
            RtlFillMemory( Mapping, ChunkSize, Pattern );
        } except( EXCEPTION_EXECUTE_HANDLER ) {

            //
            //  Undo the mapping and drop the mutex before raising.
            //

            MmUnmapReservedMapping( Vcb->ReservedMapping, RESERVE_POOL_TAG, ChunkMdl );
            MmPrepareMdlForReuse( ChunkMdl );
            ExReleaseFastMutexUnsafe( &Vcb->ReservedMappingMutex );
            NtfsRaiseStatus( IrpContext, STATUS_INVALID_USER_BUFFER, NULL, NULL );
        }

        MmUnmapReservedMapping( Vcb->ReservedMapping, RESERVE_POOL_TAG, ChunkMdl );
        MmPrepareMdlForReuse( ChunkMdl );
        Mapping = NULL;
    }

    ExReleaseFastMutexUnsafe( &Vcb->ReservedMappingMutex );
}


NTSTATUS
NtfsVolumeDasdIo (
    IN PIRP_CONTEXT IrpContext,
    IN PIRP Irp,
    IN PSCB DasdScb,
    IN PCCB Ccb,
    IN VBO StartingVbo,
    IN ULONG ByteCount
    )

/*++

Routine Description:

    This routine performs the non-cached disk io for Volume Dasd, as described
    in its parameters.

    The transfer is refused with STATUS_VOLUME_DISMOUNTED if the volume is
    dismounted, unless the request comes through the file object recorded in
    Vcb->FileObjectWithVcbLocked.  If the Ccb carries
    CCB_FLAG_FLUSH_VOLUME_ON_IO the volume is flushed first and the flag is
    cleared.  The user buffer is then locked and the transfer is sent to the
    Vcb's target device, with StartingVbo used directly as the device offset.

Arguments:

    IrpContext - Context of the request.  IrpContext->MajorFunction supplies
        either IRP_MJ_READ or IRP_MJ_WRITE, and IRP_CONTEXT_STATE_WAIT in
        IrpContext->State selects whether this routine waits for the result.

    Irp - Supplies the requesting Irp.

    DasdScb - Supplies the DasdScb for the volume - we don't use vcb to find this
          since the vcb maybe dismounted

    Ccb - flag in it used to track whether to flush the volume

    StartingVbo - Starting offset within the file for the operation.

    ByteCount - The length of the operation.

Return Value:

    The result of the Io operation.  STATUS_PENDING if this is an asynchronous
    open (IRP_CONTEXT_STATE_WAIT is not set).  STATUS_VOLUME_DISMOUNTED if
    the volume is dismounted and this is not the handle that locked it.

    The routine raises if the Scb cannot be acquired (STATUS_CANT_WAIT), if
    the delayed flush fails with anything other than
    STATUS_FILE_CORRUPT_ERROR, or if locking the user buffer raises.

--*/

{
    NTSTATUS Status = STATUS_SUCCESS;
    PIO_STACK_LOCATION IrpSp = IoGetCurrentIrpStackLocation( Irp );
    BOOLEAN AcquiredVcb = FALSE;
    BOOLEAN AcquiredScb = FALSE;
    LOGICAL Dismounted;

    PAGED_CODE();

    DebugTrace( +1, Dbg, ("NtfsVolumeDasdIo\n") );
    DebugTrace( 0, Dbg, ("Irp           = %08lx\n", Irp) );
    DebugTrace( 0, Dbg, ("MajorFunction = %08lx\n", IrpContext->MajorFunction) );
    DebugTrace( 0, Dbg, ("Vcb           = %08lx\n", DasdScb->Vcb) );
    DebugTrace( 0, Dbg, ("StartingVbo   = %016I64x\n", StartingVbo) );
    DebugTrace( 0, Dbg, ("ByteCount     = %08lx\n", ByteCount) );

    //
    //  Acquire the vcb if we'll flush based on the ccb flag - this test is
    //  safe since its off the ccb. Acquire the dasd handle shared otherwise
    //  use the appropriate object to capture the volume mount state
    //
    //  Exactly one of AcquiredVcb / AcquiredScb ends up TRUE; the finally
    //  clause below uses them to release whichever one was taken.
    //

    if (FlagOn( Ccb->Flags, CCB_FLAG_FLUSH_VOLUME_ON_IO )) {

        NtfsAcquireExclusiveVcb( IrpContext, DasdScb->Vcb, TRUE );
        Dismounted = !FlagOn( DasdScb->Vcb->VcbState, VCB_STATE_VOLUME_MOUNTED );
        AcquiredVcb = TRUE;

    } else {

        if (!NtfsAcquireSharedScbWaitForEx( IrpContext, DasdScb )) {
            NtfsRaiseStatus( IrpContext, STATUS_CANT_WAIT, &DasdScb->Fcb->FileReference, DasdScb->Fcb );
        }
        Dismounted = FlagOn( DasdScb->ScbState, SCB_STATE_VOLUME_DISMOUNTED );
        AcquiredScb = TRUE;
    }

    try {

        //
        //  If this is the handle that locked the volume its still ok to use
        //  even if dismounted. We don't necc. own the vcb but since the volume is def. dismounted
        //  at this point if we aren't the handle in question either the value will be null or not us
        //  so we're ok in either case
        //
        //  The low bit of FileObjectWithVcbLocked is masked off before the
        //  comparison with this request's file object.
        //

        if (Dismounted &&
            (ClearFlag( (ULONG_PTR)DasdScb->Vcb->FileObjectWithVcbLocked, 1 ) != (ULONG_PTR)IrpSp->FileObject)) {

            Status = STATUS_VOLUME_DISMOUNTED;
            leave;
        }

        //
        //  Do delayed volume flush if required
        //

        if (FlagOn( Ccb->Flags, CCB_FLAG_FLUSH_VOLUME_ON_IO )) {

            ASSERT( IrpContext->ExceptionStatus == STATUS_SUCCESS );

            //
            //  No need to purge or lock the volume while flushing. NtfsFlushVolume
            //  will acquire the vcb exclusive
            //

            Status = NtfsFlushVolume( IrpContext, DasdScb->Vcb, TRUE, FALSE, TRUE, FALSE );

            //
            //  Ignore corruption errors while flushing
            //

            if (!NT_SUCCESS( Status ) && (Status != STATUS_FILE_CORRUPT_ERROR)) {

                //
                //  Report the error that there is an data section blocking the flush by returning
                //  sharing violation.  Otherwise Win32 callers will get INVALID_PARAMETER.
                //

                if (Status == STATUS_UNABLE_TO_DELETE_SECTION) {
                    Status = STATUS_SHARING_VIOLATION;
                }

                NtfsRaiseStatus( IrpContext, Status, NULL, NULL );
            }

            //
            //  The flush is done (or its corruption error ignored), so later
            //  io on this Ccb does not flush again.
            //

            ClearFlag( Ccb->Flags, CCB_FLAG_FLUSH_VOLUME_ON_IO );
        }

        //
        //  For nonbuffered I/O, we need the buffer locked in all
        //  cases.
        //
        //  This call may raise.  If this call succeeds and a subsequent
        //  condition is raised, the buffers are unlocked automatically
        //  by the I/O system when the request is completed, via the
        //  Irp->MdlAddress field.
        //

        NtfsLockUserBuffer( IrpContext,
                            Irp,
                            (IrpContext->MajorFunction == IRP_MJ_READ) ?
                            IoWriteAccess : IoReadAccess,
                            ByteCount );

        //
        //  Read or write the data.  StartingVbo is passed in the starting
        //  Lbo position of NtfsSingleAsync, i.e. the offset in the volume
        //  file is used as the offset on the target device.
        //

        NtfsSingleAsync( IrpContext,
                         DasdScb->Vcb->TargetDeviceObject,
                         StartingVbo,
                         ByteCount,
                         Irp,
                         IrpContext->MajorFunction,
                         0 );

        if (!FlagOn( IrpContext->State, IRP_CONTEXT_STATE_WAIT )) {

            //
            //  We can get rid of the IrpContext now.
            //
            //  NOTE(review): after this cleanup the finally clause below
            //  still passes IrpContext to NtfsReleaseVcb / NtfsReleaseScb -
            //  confirm NtfsCleanupIrpContext leaves it usable on this path.
            //

            IrpContext->Union.NtfsIoContext = NULL;
            NtfsCleanupIrpContext( IrpContext, TRUE );

            DebugTrace( -1, Dbg, ("NtfsVolumeDasdIo -> STATUS_PENDING\n") );
            Status = STATUS_PENDING;
            leave;
        }

        //
        //  Wait for the result
        //

        NtfsWaitSync( IrpContext );

        Status = Irp->IoStatus.Status;

        DebugTrace( -1, Dbg, ("NtfsVolumeDasdIo -> %08lx\n", Irp->IoStatus.Status) );
    } finally {

        //
        //  Release whichever of the Vcb or the Scb was acquired on entry.
        //  This runs on the normal, leave and raise paths alike.
        //

        if (AcquiredVcb) {
            NtfsReleaseVcb( IrpContext, DasdScb->Vcb );
        }
        if (AcquiredScb) {
            NtfsReleaseScb( IrpContext, DasdScb );
        }
    }

    return Status;
}


VOID
NtfsPagingFileIoWithNoAllocation (
    IN PIRP_CONTEXT IrpContext,
    IN PIRP Irp,
    IN PSCB Scb,
    IN VBO StartingVbo,
    IN ULONG ByteCount
    )

/*++

Routine Description:

    This routine performs the non-cached disk io described in its parameters.
    It should only be used with the paging file since no completion
    processing is performed.  The transfer itself uses only the caller's Irp
    and a partial Mdl built on the stack, so it guarantees fwd progress.

    The request is walked run by run through the Scb's Mcb, and each run is
    sent to the target device in pieces of at most NTFS_MDL_TRANSFER_PAGES
    pages.  Each piece is issued in turn on the caller's Irp; if the driver
    returns STATUS_PENDING the routine waits on a stack event for the piece
    to complete before moving on.  The original Irp is completed by this
    routine, either after the last piece or as soon as a piece fails with
    data lost.

Arguments:

    IrpContext - Context of the request.  IrpContext->MajorFunction supplies
        either IRP_MJ_READ or IRP_MJ_WRITE.

    Irp - Supplies the requesting Irp.

    Scb - Supplies the file to act on.

    StartingVbo - Starting offset within the file for the operation.

    ByteCount - The length of the operation.

Return Value:

    None.  The Irp is completed with the status left in Irp->IoStatus.Status
    by the last piece that was sent.

--*/

{
    //
    //  Stack storage for the partial Mdl, sized for a transfer of
    //  NTFS_MDL_TRANSFER_PAGES pages plus one more page frame entry.
    //

    UCHAR Buffer[sizeof( MDL ) + sizeof( PFN_NUMBER ) * (NTFS_MDL_TRANSFER_PAGES + 1)];
    PMDL PartialMdl = (PMDL) Buffer;

    //
    //  The Irp's own Mdl.  Irp->MdlAddress is pointed at the partial Mdl
    //  for each piece and put back to this before the Irp is completed.
    //

    PMDL MasterMdl = Irp->MdlAddress;

    LONGLONG ThisClusterCount;
    ULONG ThisByteCount;
    LCN ThisLcn;
    LBO ThisLbo;
    VCN ThisVcn;

    PIO_STACK_LOCATION IrpSp;
    ULONG BufferOffset;

    PVCB Vcb = Scb->Vcb;

    ULONG ClusterOffset;
    VCN BeyondLastCluster;

    NTSTATUS Status;
    KEVENT Event;

    //
    //  Initialize some locals.
    //
    //  ClusterOffset is only non-zero for the first run; it is reset to
    //  zero at the bottom of the outer loop.  Event is handed to the
    //  completion routine as its context.
    //

    BufferOffset = 0;
    ClusterOffset = (ULONG) StartingVbo & Vcb->ClusterMask;
    BeyondLastCluster = LlClustersFromBytes( Vcb, StartingVbo + ByteCount );
    KeInitializeEvent( &Event, SynchronizationEvent, FALSE );
    RtlZeroMemory( Buffer, sizeof( Buffer ) );
    ThisVcn = LlClustersFromBytesTruncate( Vcb, StartingVbo );

    while (ByteCount > 0) {

        //
        //  Try to lookup the next run
        //  Paging files reads/ writes should always be correct.  If
        //  we didn't find the allocation, something bad has happened.
        //

        if (!NtfsLookupNtfsMcbEntry( &Scb->Mcb,
                                     ThisVcn,
                                     &ThisLcn,
                                     &ThisClusterCount,
                                     NULL,
                                     NULL,
                                     NULL,
                                     NULL )) {;

            NtfsBugCheck( 0, 0, 0 );
        }

        //
        //  Adjust from Lcn to Lbo.
        //

        ThisLbo = LlBytesFromClusters( Vcb, ThisLcn ) + ClusterOffset;

        //
        // If next run is larger than we need, "ya get what you need".
        //
        //  Otherwise the whole run, less the offset into its first cluster,
        //  is transferred on this pass.
        //

        ThisByteCount = BytesFromClusters( Vcb, (ULONG) ThisClusterCount ) - ClusterOffset;
        if (ThisVcn + ThisClusterCount >= BeyondLastCluster) {

            ThisByteCount = ByteCount;
        }

        //
        //  Now that we have properly bounded this piece of the
        //  transfer, it is time to read/write it NTFS_MDL_TRANSFER_PAGES pages at a time.
        //

        while (ThisByteCount > 0) {

            ULONG TransferSize = min( NTFS_MDL_TRANSFER_PAGES * PAGE_SIZE, ThisByteCount );

            //
            //  The partial mdl is on the stack.  It describes TransferSize
            //  bytes of the master Mdl starting BufferOffset bytes past
            //  Irp->UserBuffer.
            //

            PartialMdl->Size = sizeof( Buffer );
            IoBuildPartialMdl( MasterMdl,
                               PartialMdl,
                               Add2Ptr( Irp->UserBuffer, BufferOffset ),
                               TransferSize );

            Irp->MdlAddress = PartialMdl;
            IrpSp = IoGetNextIrpStackLocation( Irp );

            //
            //  Setup the Stack location to do a read (or write, as given by
            //  IrpContext->MajorFunction) from the disk driver.
            //

            IrpSp->MajorFunction = IrpContext->MajorFunction;
            IrpSp->Parameters.Read.Length = TransferSize;
            IrpSp->Parameters.Read.ByteOffset.QuadPart = ThisLbo;

            IoSetCompletionRoutine( Irp, NtfsPagingFileNoAllocCompletionRoutine, &Event, TRUE, TRUE, TRUE );

            //
            //  Send the piece down and, if it went pending, wait for it.
            //

            Status = IoCallDriver( Vcb->TargetDeviceObject, Irp );
            if (Status == STATUS_PENDING) {

                KeWaitForSingleObject( &Event, Executive, KernelMode, FALSE, NULL );
                Status = Irp->IoStatus.Status;

            }

            ASSERT( Status != STATUS_INSUFFICIENT_RESOURCES );

            //
            //  Anything other than a clean success, including the two FT
            //  recovery success codes, is examined here.  Note that this
            //  test reads Irp->IoStatus.Status while the tests just below
            //  read the local Status.
            //

            if (!FT_SUCCESS( Irp->IoStatus.Status )) {

                BOOLEAN DataLost = TRUE;

                if (!FsRtlIsTotalDeviceFailure( Status ) &&
                    (Status != STATUS_VERIFY_REQUIRED)) {

                    //
                    //  We don't want to try to hotfix READ errors on the paging file
                    //  because of deadlock possibilities with MM. Instead we'll just
                    //  return the error for MM to deal with. Chances are that
                    //  MM (eg. MiWaitForInPageComplete) will bugcheck anyway,
                    //  but it's still nicer than walking right into the deadlock.
                    //

                    if (IrpSp->MajorFunction != IRP_MJ_READ) {

                        if ((Irp->IoStatus.Status == STATUS_FT_READ_RECOVERY_FROM_BACKUP) ||
                            (Irp->IoStatus.Status == STATUS_FT_WRITE_RECOVERY)) {

                            //
                            //  We got the data down on part of the mirror so we can do the fix
                            //  asynchronously
                            //

                            DataLost = FALSE;
                        }

                        //
                        //  Start an async hotfix
                        //

                        try {

                            NtfsPostHotFix( Irp,
                                            &StartingVbo,
                                            ThisLbo,
                                            TransferSize,
                                            FALSE );

                        } except( GetExceptionCode() == STATUS_INSUFFICIENT_RESOURCES ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH ) {

                            //
                            //  If we don't have enough memory to post the hotfix - so be it
                            //  continue on
                            //

                            NtfsMinimumExceptionProcessing( IrpContext );
                        }
                    }
                }

                //
                //  If mm needs to rewrite the data return back the error.
                //  DataLost is only cleared above for a non-read request
                //  that finished with one of the FT recovery codes; in every
                //  other case the Irp is completed here and the rest of the
                //  transfer is abandoned.
                //

                if (DataLost) {
                    Irp->MdlAddress = MasterMdl;
                    NtfsCompleteRequest( NULL, Irp, Irp->IoStatus.Status );
                    return;
                }
            }

            //
            //  Now adjust everything for the next pass through the loop
            //

            StartingVbo += TransferSize;
            BufferOffset += TransferSize;
            ByteCount -= TransferSize;
            ThisByteCount -= TransferSize;
            ThisLbo += TransferSize;
        }

        //
        //  Now adjust everything for the next run.  Later runs start on a
        //  cluster boundary.  The outer loop ends when ByteCount reaches
        //  zero.
        //

        ClusterOffset = 0;
        ThisVcn += ThisClusterCount;
    }

    //
    //  Finally restore back the fields and complete the original irp
    //

    Irp->MdlAddress = MasterMdl;
    NtfsCompleteRequest( NULL, Irp, Irp->IoStatus.Status );
}


VOID
NtfsPagingFileIo (
    IN PIRP_CONTEXT IrpContext,
    IN PIRP Irp,
    IN PSCB Scb,
    IN VBO StartingVbo,
    IN ULONG ByteCount
    )

/*++

Routine Description:

    This routine performs the non-cached disk io described in its parameters.
    This routine never blocks, and should only be used with the paging
    file since no completion processing is performed.

Arguments:

    IrpContext->MajorFunction - Supplies either IRP_MJ_READ or IRP_MJ_WRITE.

    Irp - Supplies the requesting Irp.

    Scb - Supplies the file to act on.

    StartingVbo - Starting offset within the file for the operation.

    ByteCount - The lengh of the operation.

Return Value:

    None.

--*/

{
    LONGLONG ThisClusterCount;
    ULONG ThisByteCount;

    LCN ThisLcn;
    LBO ThisLbo;

    VCN ThisVcn;

    PIRP AssocIrp;
    PIRP ContextIrp;
    PIO_STACK_LOCATION IrpSp;
    ULONG BufferOffset;
    PDEVICE_OBJECT DeviceObject;
    PFILE_OBJECT FileObject;
    PDEVICE_OBJECT OurDeviceObject;

    PVCB Vcb = Scb->Vcb;

    LIST_ENTRY AssociatedIrps;
    ULONG AssociatedIrpCount;

    ULONG ClusterOffset;
    VCN BeyondLastCluster;

    VBO OriginalStartingVbo = StartingVbo;
    ULONG OriginalByteCount = ByteCount;

    ClearFlag( Vcb->Vpb->RealDevice->Flags, DO_VERIFY_VOLUME ); //****ignore verify for now

    //
    //  Check whether we want to set the low order bit in the Irp to pass
    //  as a context value to the completion routine.
    //

    ContextIrp = Irp;

    if (FlagOn( IrpContext->Flags, IRP_CONTEXT_FLAG_HOTFIX_UNDERWAY )) {

        SetFlag( ((ULONG_PTR) ContextIrp), 0x1 );
    }

    //
    //  Check that we are sector aligned.
    //

    ASSERT( (((ULONG)StartingVbo) & (Vcb->BytesPerSector - 1)) == 0 );

    //
    //  Initialize some locals.
    //

    BufferOffset = 0;
    ClusterOffset = (ULONG) StartingVbo & Vcb->ClusterMask;
    DeviceObject = Vcb->TargetDeviceObject;
    BeyondLastCluster = LlClustersFromBytes( Vcb, StartingVbo + ByteCount );

    //
    //  Try to lookup the first run.  If there is just a single run,
    //  we may just be able to pass it on.
    //

    ThisVcn = LlClustersFromBytesTruncate( Vcb, StartingVbo );

    //
    //  Paging files reads/ writes should always be correct.  If we didn't
    //  find the allocation, something bad has happened.
    //

    if (!NtfsLookupNtfsMcbEntry( &Scb->Mcb,
                                 ThisVcn,
                                 &ThisLcn,
                                 &ThisClusterCount,
                                 NULL,
                                 NULL,
                                 NULL,
                                 NULL )) {

        NtfsRaiseStatus( IrpContext, STATUS_FILE_CORRUPT_ERROR, NULL, Scb->Fcb );
    }

    //
    //  Adjust from Lcn to Lbo.
    //

    ThisLbo = LlBytesFromClusters( Vcb, ThisLcn ) + ClusterOffset;

    //
    //  Now set up the Irp->IoStatus.  It will be modified by the
    //  multi-completion routine in case of error or verify required.
    //

    Irp->IoStatus.Status = STATUS_SUCCESS;
    Irp->IoStatus.Information = ByteCount;

    //
    //  Save the FileObject.
    //

    IrpSp = IoGetCurrentIrpStackLocation( Irp );
    FileObject = IrpSp->FileObject;
    OurDeviceObject = IrpSp->DeviceObject;

    //
    //  See if the write covers a single valid run, and if so pass
    //  it on.
    //

    if (ThisVcn + ThisClusterCount >= BeyondLastCluster) {

        DebugTrace( 0, Dbg, ("Passing Irp on to Disk Driver\n") );

        //
        //  We use our stack location to store request information in a
        //  rather strange way, to give us enough context to post a
        //  hot fix on error.  It's ok, because it is our stack location!
        //

        IrpSp->Parameters.Read.ByteOffset.QuadPart = ThisLbo;
        IrpSp->Parameters.Read.Key = ((ULONG)StartingVbo);

        //
        //  Set up the completion routine address in our stack frame.
        //  This is only invoked on error or cancel, and just copies
        //  the error Status into master irp's iosb.
        //

        IoSetCompletionRoutine( Irp,
                                &NtfsPagingFileCompletionRoutine,
                                ContextIrp,
                                (BOOLEAN)!FlagOn(Vcb->VcbState, VCB_STATE_NO_SECONDARY_AVAILABLE),
                                TRUE,
                                TRUE );

        //
        //  Setup the next IRP stack location for the disk driver beneath us.
        //

        IrpSp = IoGetNextIrpStackLocation( Irp );

        //
        //  Setup the Stack location to do a read from the disk driver.
        //

        IrpSp->MajorFunction = IrpContext->MajorFunction;
        IrpSp->Parameters.Read.Length = ByteCount;
        IrpSp->Parameters.Read.ByteOffset.QuadPart = ThisLbo;

        //
        //  Issue the read/write request
        //
        //  If IoCallDriver returns an error, it has completed the Irp
        //  and the error will be dealt with as a normal IO error.
        //

        (VOID)IoCallDriver( DeviceObject, Irp );

        DebugTrace( -1, Dbg, ("NtfsPagingFileIo -> VOID\n") );
        return;
    }

    //
    //  Loop while there are still byte writes to satisfy.  Always keep the
    //  associated irp count one up, so that the master irp won't get
    //  completed prematurly.
    //

    try {

        //
        //  We will allocate and initialize all of the Irps and then send
        //  them down to the driver.  We will queue them off of our
        //  AssociatedIrp queue.
        //

        InitializeListHead( &AssociatedIrps );
        AssociatedIrpCount = 0;

        while (TRUE) {

            //
            //  Reset this for unwinding purposes
            //

            AssocIrp = NULL;

            //
            //  If next run is larger than we need, "ya get what you need".
            //

            ThisByteCount = BytesFromClusters( Vcb, (ULONG) ThisClusterCount ) - ClusterOffset;
            if (ThisVcn + ThisClusterCount >= BeyondLastCluster) {

                ThisByteCount = ByteCount;
            }

            //
            //  Now that we have properly bounded this piece of the
            //  transfer, it is time to read/write it.
            //

            AssocIrp = IoMakeAssociatedIrp( Irp, (CCHAR)(DeviceObject->StackSize + 1) );

            if (AssocIrp == NULL) {
                break;
            }

            //
            //  Now add the Irp to our queue of Irps.
            //

            InsertTailList( &AssociatedIrps, &AssocIrp->Tail.Overlay.ListEntry );

            //
            //  Allocate and build a partial Mdl for the request.
            //

            {
                PMDL Mdl;

                Mdl = IoAllocateMdl( (PCHAR)Irp->UserBuffer + BufferOffset,
                                     ThisByteCount,
                                     FALSE,
                                     FALSE,
                                     AssocIrp );

                if (Mdl == NULL) {
                    break;
                }

                IoBuildPartialMdl( Irp->MdlAddress,
                                   Mdl,
                                   Add2Ptr( Irp->UserBuffer, BufferOffset ),
                                   ThisByteCount );
            }

            AssociatedIrpCount += 1;

            //
            //  Get the first IRP stack location in the associated Irp
            //

            IoSetNextIrpStackLocation( AssocIrp );
            IrpSp = IoGetCurrentIrpStackLocation( AssocIrp );

            //
            //  We use our stack location to store request information in a
            //  rather strange way, to give us enough context to post a
            //  hot fix on error.  It's ok, because it is our stack location!
            //

            IrpSp->MajorFunction = IrpContext->MajorFunction;
            IrpSp->Parameters.Read.Length = ThisByteCount;
            IrpSp->Parameters.Read.ByteOffset.QuadPart = ThisLbo;
            IrpSp->Parameters.Read.Key = ((ULONG)StartingVbo);
            IrpSp->FileObject = FileObject;
            IrpSp->DeviceObject = OurDeviceObject;

            //
            //  Set up the completion routine address in our stack frame.
            //  This is only invoked on error or cancel, and just copies
            //  the error Status into master irp's iosb.
            //

            IoSetCompletionRoutine( AssocIrp,
                                    &NtfsPagingFileCompletionRoutine,
                                    ContextIrp,
                                    (BOOLEAN)!FlagOn(Vcb->VcbState, VCB_STATE_NO_SECONDARY_AVAILABLE),
                                    TRUE,
                                    TRUE );

            //
            //  Setup the next IRP stack location in the associated Irp for the disk
            //  driver beneath us.
            //

            IrpSp = IoGetNextIrpStackLocation( AssocIrp );

            //
            //  Setup the Stack location to do a read from the disk driver.
            //

            IrpSp->MajorFunction = IrpContext->MajorFunction;
            IrpSp->Parameters.Read.Length = ThisByteCount;
            IrpSp->Parameters.Read.ByteOffset.QuadPart = ThisLbo;

            //
            //  Now adjust everything for the next pass through the loop but
            //  break out now if all the irps have been created for the io.
            //

            StartingVbo += ThisByteCount;
            BufferOffset += ThisByteCount;
            ByteCount -= ThisByteCount;
            ClusterOffset = 0;
            ThisVcn += ThisClusterCount;


            if (ByteCount == 0) {

                break;
            }

            //
            //  Try to lookup the next run (if we are not done).
            //  Paging files reads/ writes should always be correct.  If
            //  we didn't find the allocation, something bad has happened.
            //

            if (!NtfsLookupNtfsMcbEntry( &Scb->Mcb,
                                         ThisVcn,
                                         &ThisLcn,
                                         &ThisClusterCount,
                                         NULL,
                                         NULL,
                                         NULL,
                                         NULL )) {;

                NtfsBugCheck( 0, 0, 0 );
            }

            ThisLbo = LlBytesFromClusters( Vcb, ThisLcn );

        } //  while (ByteCount != 0)

        if (ByteCount == 0) {

            //
            //  We have now created all of the Irps that we need.  We will set the
            //  Irp count in the master Irp and then fire off the associated irps.
            //

            Irp->AssociatedIrp.IrpCount = AssociatedIrpCount;

            while (!IsListEmpty( &AssociatedIrps )) {

                AssocIrp = CONTAINING_RECORD( AssociatedIrps.Flink,
                                              IRP,
                                              Tail.Overlay.ListEntry );

                RemoveHeadList( &AssociatedIrps );

                (VOID) IoCallDriver( DeviceObject, AssocIrp );
            }
        } else {

            NtfsPagingFileIoWithNoAllocation( IrpContext, Irp, Scb, OriginalStartingVbo, OriginalByteCount );
        }
    } finally {

        DebugUnwind( NtfsPagingFileIo );

        //
        //  In the case of an error we must clean up any of the associated Irps
        //  we have created.
        //

        while (!IsListEmpty( &AssociatedIrps )) {

            AssocIrp = CONTAINING_RECORD( AssociatedIrps.Flink,
                                          IRP,
                                          Tail.Overlay.ListEntry );

            RemoveHeadList( &AssociatedIrps );

            if (AssocIrp->MdlAddress != NULL) {

                IoFreeMdl( AssocIrp->MdlAddress );
                AssocIrp->MdlAddress = NULL;
            }

            IoFreeIrp( AssocIrp );
        }
    }

    DebugTrace( -1, Dbg, ("NtfsPagingFileIo -> VOID\n") );
    return;
}


BOOLEAN
NtfsIsReadAheadThread (
    )

/*++

Routine Description:

    This routine returns whether the current thread is doing read ahead.

Arguments:

    None

Return Value:

    FALSE - if the thread is not doing read ahead
    TRUE - if the thread is doing read ahead

--*/

{
    PREAD_AHEAD_THREAD ReadAheadThread;
    PVOID CurrentThread = PsGetCurrentThread();
    KIRQL OldIrql;

    OldIrql = KeAcquireQueuedSpinLock( LockQueueNtfsStructLock );

    ReadAheadThread = (PREAD_AHEAD_THREAD)NtfsData.ReadAheadThreads.Flink;

    //
    //  Scan for our thread, stopping at the end of the list or on the first
    //  NULL.  We can stop on the first NULL, since when we free an entry
    //  we move it to the end of the list.
    //

    while ((ReadAheadThread != (PREAD_AHEAD_THREAD)&NtfsData.ReadAheadThreads) &&
           (ReadAheadThread->Thread != NULL)) {

        //
        //  Get out if we see our thread.
        //

        if (ReadAheadThread->Thread == CurrentThread) {

            KeReleaseQueuedSpinLock( LockQueueNtfsStructLock, OldIrql );
            return TRUE;
        }
        ReadAheadThread = (PREAD_AHEAD_THREAD)ReadAheadThread->Links.Flink;
    }

    KeReleaseQueuedSpinLock( LockQueueNtfsStructLock, OldIrql );
    return FALSE;
}


//
//  Internal support routine
//

VOID
NtfsAllocateCompressionBuffer (
    IN PIRP_CONTEXT IrpContext,
    IN PSCB ThisScb,
    IN PIRP Irp,
    IN PCOMPRESSION_CONTEXT CompressionContext,
    IN OUT PULONG CompressionBufferLength
    )

/*++

Routine Description:

    This routine makes sure the compression context owns a compression buffer
    of at least the requested length, described by an Mdl, and points the Irp
    at that buffer.  The Irp's original Mdl and user buffer are remembered in
    the compression context.  If the context already records a buffer that is
    large enough nothing is allocated.

    Note that whoever allocates the CompressionContext must initially zero it.

Arguments:

    IrpContext - Context for the request.  IrpContext->MajorFunction selects
        which reserved buffer type is requested from NtfsCreateMdlAndBuffer.

    ThisScb - The stream where the IO is taking place.

    Irp - Irp for the current request

    CompressionContext - Pointer to the compression context for the request.

    CompressionBufferLength - Supplies length required for the compression buffer.
                              Returns length available.

Return Value:

    None.

--*/

{
    PMDL NewMdl;
    UCHAR ReservedBufferType;

    //
    //  The length already recorded in the context covers the request, so
    //  just report that length back to the caller.
    //

    if (*CompressionBufferLength <= CompressionContext->CompressionBufferLength) {

        *CompressionBufferLength = CompressionContext->CompressionBufferLength;
        return;
    }

    //
    //  A saved Mdl means the Irp currently describes an earlier (too small)
    //  compression buffer.  Free that buffer and put the Irp back the way
    //  we found it before allocating a bigger one.
    //

    if (CompressionContext->SavedMdl != NULL) {

        PMDL OldMdl = Irp->MdlAddress;

        //
        //  Restore the byte count for which the Mdl was created, and free it.
        //

        OldMdl->ByteCount = CompressionContext->CompressionBufferLength;

        NtfsDeleteMdlAndBuffer( OldMdl, CompressionContext->CompressionBuffer );

        Irp->MdlAddress = CompressionContext->SavedMdl;
        Irp->UserBuffer = CompressionContext->SavedUserBuffer;

        CompressionContext->SavedMdl = NULL;
        CompressionContext->CompressionBuffer = NULL;
    }

    //
    //  Record the requested length; the allocator is handed a pointer to
    //  this field.
    //

    CompressionContext->CompressionBufferLength = *CompressionBufferLength;

    if (IrpContext->MajorFunction == IRP_MJ_WRITE) {

        ReservedBufferType = (UCHAR) RESERVED_BUFFER_TWO_NEEDED;

    } else {

        ReservedBufferType = (UCHAR) RESERVED_BUFFER_ONE_NEEDED;
    }

    //
    //  Allocate the compression buffer or raise
    //

    NtfsCreateMdlAndBuffer( IrpContext,
                            ThisScb,
                            ReservedBufferType,
                            &CompressionContext->CompressionBufferLength,
                            &NewMdl,
                            &CompressionContext->CompressionBuffer );

    //
    //  Remember what the Irp described originally, then switch it over to
    //  the new buffer and Mdl.
    //

    CompressionContext->SavedMdl = Irp->MdlAddress;
    CompressionContext->SavedUserBuffer = Irp->UserBuffer;

    Irp->MdlAddress = NewMdl;
    Irp->UserBuffer = CompressionContext->CompressionBuffer;

    //
    //  Tell the caller the length now recorded in the context.
    //

    *CompressionBufferLength = CompressionContext->CompressionBufferLength;
}


//
//  Internal support routine
//

VOID
NtfsDeallocateCompressionBuffer (
    IN PIRP Irp,
    IN PCOMPRESSION_CONTEXT CompressionContext,
    IN BOOLEAN Reinitialize
    )

/*++

Routine Description:

    This routine performs all necessary cleanup for a compressed I/O, as
    described by the compression context: it frees the compression buffer
    (and its Mdl if the Irp was redirected to it), restores the Irp's
    original Mdl and user buffer, and frees the work space.

Arguments:

    Irp - Irp for the current request

    CompressionContext - Pointer to the compression context for the request.

    Reinitialize - TRUE if we plan to continue using this context.  In that
        case the cleaned up fields are reset; otherwise the IoRuns array is
        freed when AllocatedRuns differs from NTFS_MAX_PARALLEL_IOS.

Return Value:

    None.

--*/

{
    if (CompressionContext->SavedMdl == NULL) {

        //
        //  The Irp was never redirected, so there is only a buffer to free.
        //

        NtfsDeleteMdlAndBuffer( NULL,
                                CompressionContext->CompressionBuffer );

    } else {

        //
        //  The Irp currently describes the compression buffer.  Put back the
        //  byte count the Mdl was allocated with, free both, and then hand
        //  the Irp its original Mdl and user buffer again.
        //

        Irp->MdlAddress->ByteCount = CompressionContext->CompressionBufferLength;

        NtfsDeleteMdlAndBuffer( Irp->MdlAddress,
                                CompressionContext->CompressionBuffer );

        Irp->MdlAddress = CompressionContext->SavedMdl;
        Irp->UserBuffer = CompressionContext->SavedUserBuffer;
    }

    //
    //  Release the work space structure if one was allocated.
    //

    if (CompressionContext->WorkSpace != NULL) {

        NtfsDeleteMdlAndBuffer( NULL, CompressionContext->WorkSpace );
    }

    //
    //  If the context is finished with, the only thing left is the IoRuns
    //  array.  NOTE(review): AllocatedRuns != NTFS_MAX_PARALLEL_IOS
    //  presumably means the array came from pool rather than the default
    //  storage - confirm against the code that sets AllocatedRuns.
    //

    if (!Reinitialize) {

        if (CompressionContext->AllocatedRuns != NTFS_MAX_PARALLEL_IOS) {

            NtfsFreePool( CompressionContext->IoRuns );
        }

        return;
    }

    //
    //  The context will be used again, so clear the fields describing what
    //  we just cleaned up.
    //

    CompressionContext->SavedMdl = NULL;
    CompressionContext->SavedUserBuffer = NULL;
    CompressionContext->CompressionBuffer = NULL;
    CompressionContext->WorkSpace = NULL;
    CompressionContext->CompressionBufferLength = 0;
}


//
//  Internal support routine
//

LONG
NtfsCompressionFilter (
    IN PIRP_CONTEXT IrpContext,
    IN PEXCEPTION_POINTERS ExceptionPointer
    )

/*++

Routine Description:

    Exception filter which always elects to run the associated handler.
    It asserts that the exception code is one FsRtlIsNtstatusExpected
    accepts, but does not otherwise examine the exception.

Arguments:

    IrpContext - Unused.

    ExceptionPointer - Exception information; only read by the ASSERT.

Return Value:

    EXCEPTION_EXECUTE_HANDLER in all cases.

--*/

{
    UNREFERENCED_PARAMETER( IrpContext );

    //
    //  ExceptionPointer is only touched by the ASSERT below; presumably it
    //  is marked unreferenced for builds where ASSERT compiles to nothing.
    //

    UNREFERENCED_PARAMETER( ExceptionPointer );

    ASSERT( FsRtlIsNtstatusExpected( ExceptionPointer->ExceptionRecord->ExceptionCode ) );
    return EXCEPTION_EXECUTE_HANDLER;
}


//
//  Internal support routine
//

ULONG
NtfsPrepareBuffers (
    IN PIRP_CONTEXT IrpContext,
    IN PIRP Irp,
    IN PSCB Scb,
    IN PVBO StartingVbo,
    IN ULONG ByteCount,
    IN ULONG StreamFlags,
    IN OUT PBOOLEAN Wait,
    OUT PULONG NumberRuns,
    OUT PCOMPRESSION_CONTEXT CompressionContext
    )

/*++

Routine Description:

    This routine prepares the buffers for a noncached transfer, and fills
    in the IoRuns array to describe all of the separate transfers which must
    take place.

    For compressed reads, the exact size of the compressed data is
    calculated by scanning the run information, and a buffer is allocated
    to receive the compressed data.

    For compressed writes, an estimate is made on how large of a compressed
    buffer will be required.  Then the compression is performed, as much as
    possible, into the compressed buffer which was allocated.

Arguments:

    IrpContext->MajorFunction - Supplies either IRP_MJ_READ or IRP_MJ_WRITE.

    Irp - Supplies the requesting Irp.

    Scb - Supplies the stream file to act on.

    StartingVbo - The starting point for the operation.

    ByteCount - The length of the operation.

    NumberRuns - Returns the number of runs filled in to the IoRuns array.

    CompressionContext - Returns information related to the compression
                         to be cleaned up after the transfer.

    StreamFlags - Supplies either 0 or some combination of COMPRESSED_STREAM
                  and ENCRYPTED_STREAM

Return Value:

    Returns uncompressed bytes remaining to be processed, or 0 if all buffers
    are prepared in the IoRuns and CompressionContext.

--*/

{
    PVOID RangePtr;
    ULONG Index;

    LBO NextLbo;
    LCN NextLcn;
    VBO TempVbo;

    ULONG NextLcnOffset;

    VCN StartingVcn;

    ULONG NextByteCount;
    ULONG ReturnByteCount;
    ULONG TrimmedByteCount;
    LONGLONG NextClusterCount;

    BOOLEAN NextIsAllocated;
    BOOLEAN SparseWrite = FALSE;
    BOOLEAN OriginalSparseWrite = FALSE;

    ULONG BufferOffset;

    ULONG StructureSize;
    ULONG UsaOffset;
    ULONG BytesInIoRuns;
    BOOLEAN StopForUsa;

    PVOID SystemBuffer;

    ULONG CompressionUnit, CompressionUnitInClusters;
    ULONG CompressionUnitOffset;
    ULONG CompressedSize, FinalCompressedSize;
    LONGLONG FinalCompressedClusters;
    ULONG LastStartUsaIoRun;
    LOGICAL ReadRequest;
    PIO_STACK_LOCATION IrpSp;

    PIO_RUN IoRuns;

    NTSTATUS Status;

    VBO StartVbo = *StartingVbo;
    PVCB Vcb = Scb->Vcb;

    PAGED_CODE();

    //
    //  Initialize some locals.
    //

    IoRuns = CompressionContext->IoRuns;
    *NumberRuns = 0;
    IrpSp = IoGetCurrentIrpStackLocation(Irp);
    ReadRequest = (LOGICAL)((IrpContext->MajorFunction == IRP_MJ_READ) ||
                            ((IrpContext->MajorFunction == IRP_MJ_FILE_SYSTEM_CONTROL) &&
                             (IrpContext->MinorFunction == IRP_MN_USER_FS_REQUEST) &&
                             (IrpSp->Parameters.FileSystemControl.FsControlCode == FSCTL_READ_FROM_PLEX)));

    //
    // For nonbuffered I/O, we need the buffer locked in all
    // cases.
    //
    // This call may raise.  If this call succeeds and a subsequent
    // condition is raised, the buffers are unlocked automatically
    // by the I/O system when the request is completed, via the
    // Irp->MdlAddress field.
    //

    ASSERT( FIELD_OFFSET(IO_STACK_LOCATION, Parameters.Read.Length) ==
            FIELD_OFFSET(IO_STACK_LOCATION, Parameters.Write.Length) );

    NtfsLockUserBuffer( IrpContext,
                        Irp,
                        ReadRequest ?
                          IoWriteAccess : IoReadAccess,
                        IrpSp->Parameters.Read.Length );

    //
    //  Normally the Mdl BufferOffset picks up from where we last left off.
    //  However, for those cases where we have called NtfsAllocateCompressionBuffer,
    //  for a scratch buffer, we always reset to offset 0.
    //

    BufferOffset = CompressionContext->SystemBufferOffset;
    if (CompressionContext->SavedMdl != NULL) {
        BufferOffset = 0;
    }

    //
    //  Check if this request wants to drive the IO directly from the Mcb.  This is
    //  the case for all Scb's without a compression unit or for reads of uncompressed
    //  files or compressed reads.  Also proceed with sparse writes optimistically
    //  assuming the compression unit is allocated.
    //

    if ((ReadRequest) ?

         //
         //  Trust Mcb on reads of uncompressed files or reading compressed data.
         //

         ((Scb->CompressionUnit == 0) ||
          !FlagOn( Scb->AttributeFlags, ATTRIBUTE_FLAG_COMPRESSION_MASK ) ||
          FlagOn( StreamFlags, COMPRESSED_STREAM )) :

         //
         //  Trust Mcb (optimistically) for writes of uncompressed sparse files.
         //

         ((Scb->CompressionUnit == 0) ||
          (OriginalSparseWrite =
           SparseWrite = FlagOn( Scb->AttributeFlags,
                                  ATTRIBUTE_FLAG_COMPRESSION_MASK | ATTRIBUTE_FLAG_SPARSE ) == ATTRIBUTE_FLAG_SPARSE))) {

        ASSERT( (ReadRequest) ||
                (IrpContext->MajorFunction == IRP_MJ_WRITE) ||
                FlagOn( StreamFlags, COMPRESSED_STREAM ) );

        ASSERT( (Scb->CompressionUnit == 0) ||
                NtfsIsTypeCodeCompressible( Scb->AttributeTypeCode ) );

        //
        //  If this is a Usa-protected structure and we are reading, figure out
        //  what units we want to access it in.
        //

        TrimmedByteCount = 0;

        if ((Scb->EncryptionContext != NULL) &&
            (IrpContext->MajorFunction == IRP_MJ_WRITE)) {

            //
            //  For an encrypted file, we will be allocating a new buffer in the irp
            //  so the entries in the ioruns array should have offsets relative to
            //  this new buffer.
            //

            if (ByteCount > LARGE_BUFFER_SIZE) {

                //
                //  Trim to LARGE_BUFFER_SIZE and remember the amount trimmed
                //  to add back to byte count later.
                //

                TrimmedByteCount = ByteCount - LARGE_BUFFER_SIZE;
                ByteCount = LARGE_BUFFER_SIZE;

                DebugTrace( 0, Dbg, ("\nTrimming ByteCount by %x", TrimmedByteCount) );
            }
        }

        StructureSize = ByteCount;
        if (FlagOn(Scb->ScbState, SCB_STATE_USA_PRESENT) &&
            (ReadRequest)) {

            //
                //  Get the number of blocks, based on what type of stream it is.
            //  First check for Mft or Log file.
            //

            if (Scb->Header.NodeTypeCode == NTFS_NTC_SCB_MFT) {

                ASSERT((Scb == Vcb->MftScb) || (Scb == Vcb->Mft2Scb));

                StructureSize = Vcb->BytesPerFileRecordSegment;

            //
            //  Otherwise it is an index, so we can get the count out of the Scb.
            //

            } else if (Scb->Header.NodeTypeCode != NTFS_NTC_SCB_DATA) {

                StructureSize = Scb->ScbType.Index.BytesPerIndexBuffer;
            }

            //
            //  Remember the last index in the IO runs array which will allow us to
            //  read in a full USA structure in the worst case.
            //

            LastStartUsaIoRun = ClustersFromBytes( Vcb, StructureSize );

            if (LastStartUsaIoRun > NTFS_MAX_PARALLEL_IOS) {

                LastStartUsaIoRun = 0;

            } else {

                LastStartUsaIoRun = NTFS_MAX_PARALLEL_IOS - LastStartUsaIoRun;
            }
        }

        BytesInIoRuns = 0;
        UsaOffset = 0;
        StopForUsa = FALSE;

        while ((ByteCount != 0) && (*NumberRuns != NTFS_MAX_PARALLEL_IOS) && !StopForUsa) {

            //
            //  Lookup next run
            //

            StartingVcn = LlClustersFromBytesTruncate( Vcb, StartVbo );

            //
            //  If another writer is modifying the Mcb of a sparse file then we need
            //  to serialize our lookup.
            //

            if (FlagOn( Scb->ScbState, SCB_STATE_PROTECT_SPARSE_MCB )) {

                NtfsPurgeFileRecordCache( IrpContext );
                NtfsAcquireSharedScb( IrpContext, Scb );

                try {

                    NextIsAllocated = NtfsLookupAllocation( IrpContext,
                                                            Scb,
                                                            StartingVcn,
                                                            &NextLcn,
                                                            &NextClusterCount,
                                                            NULL,
                                                            NULL );

                } finally {

                    NtfsReleaseScb( IrpContext, Scb );
                }

            } else {

                //
                //  Purge because lookupallocation may acquire the scb main if it needs to load
                //  which will be first main acquire and can be blocked behind an acquireallfiles
                //

                NtfsPurgeFileRecordCache( IrpContext );
                NextIsAllocated = NtfsLookupAllocation( IrpContext,
                                                        Scb,
                                                        StartingVcn,
                                                        &NextLcn,
                                                        &NextClusterCount,
                                                        NULL,
                                                        NULL );
            }

            ASSERT( NextIsAllocated ||
                    FlagOn( Vcb->VcbState, VCB_STATE_RESTART_IN_PROGRESS ) ||
                    FlagOn( Scb->AttributeFlags, ATTRIBUTE_FLAG_SPARSE ) ||
                    (Scb == Vcb->MftScb) ||
                    FlagOn( StreamFlags, COMPRESSED_STREAM | ENCRYPTED_STREAM ) );

            //
            //  If this is a sparse write we need to deal with cases where
            //  the run is not allocated OR the last run in this transfer
            //  was unallocated but this run is allocated.
            //

            if (SparseWrite) {

                //
                //  If the current run is not allocated then break out of the loop.
                //

                if (!NextIsAllocated) {

                    //
                    //  Convert to synchronous since we need to allocate space
                    //

                    if (*Wait == FALSE) {

                        *Wait = TRUE;
                        SetFlag( IrpContext->State, IRP_CONTEXT_STATE_WAIT );

                        ClearFlag( IrpContext->Union.NtfsIoContext->Flags, NTFS_IO_CONTEXT_ASYNC );
                        KeInitializeEvent( &IrpContext->Union.NtfsIoContext->Wait.SyncEvent,
                                           NotificationEvent,
                                           FALSE );
                    }

                    break;

                }

                //
                //  Deal with the case where the last run in this transfer was not allocated.
                //  In that case we would have allocated a compression buffer and stored
                //  the original Mdl into the compression context.  Since this is an allocated
                //  range we can use the original user buffer and Mdl.  Restore these
                //  back into the original irp now.
                //
                //  If this file is encrypted, we do NOT want to change the buffer offset,
                //  because this offset will be stored as the first IoRun's buffer offset, and
                //  encrypt buffers will add the system buffer offset to that, and end up
                //  passing a bad buffer to the encryption driver.  Besides, it's inefficient
                //  to deallocate the buffer, since encrypt buffers will have to reallocate it.
                //

                if ((CompressionContext->SavedMdl != NULL) &&
                    (Scb->EncryptionContext == NULL)) {

                    NtfsDeallocateCompressionBuffer( Irp, CompressionContext, TRUE );
                    BufferOffset = CompressionContext->SystemBufferOffset;
                }
            }

            //
            //  Adjust from NextLcn to Lbo.  NextByteCount may overflow out of 32 bits
            //  but we will catch that below when we compare clusters.
            //

            NextLcnOffset = ((ULONG)StartVbo) & Vcb->ClusterMask;

            NextByteCount = BytesFromClusters( Vcb, (ULONG)NextClusterCount ) - NextLcnOffset;

            //
            // If next run is larger than we need, "ya get what you need".
            // Note that after this we are guaranteed that the HighPart of
            // NextByteCount is 0.
            //

            if ((ULONG)NextClusterCount >= ClustersFromBytes( Vcb, ByteCount + NextLcnOffset )) {

                NextByteCount = ByteCount;
            }

            //
            //  If the byte count is zero then we will spin indefinitely.  Raise
            //  corrupt here so the system doesn't hang.
            //

            if (NextByteCount == 0) {

                NtfsRaiseStatus( IrpContext, STATUS_FILE_CORRUPT_ERROR, NULL, Scb->Fcb );
            }

            //
            //  If this is a USA-protected structure, broken up in
            //  multiple runs, then we want to guarantee that we do
            //  not end up in the middle of a Usa-protected structure in the read path.
            //  Therefore, on the first run we will calculate the
            //  initial UsaOffset.  Then in the worst case it can
            //  take the remaining four runs to finish the Usa structure.
            //
            //  On the first subsequent run to complete a Usa structure,
            //  we set the count to end exactly on a Usa boundary.
            //

            if (FlagOn( Scb->ScbState, SCB_STATE_USA_PRESENT ) &&
                (ReadRequest)) {

                //
                //  So long as we know there are more IO runs left than the maximum
                //  number needed for the USA structure just maintain the current
                //  Usa offset.
                //

                if (*NumberRuns < LastStartUsaIoRun) {

                    UsaOffset = (UsaOffset + NextByteCount) & (StructureSize - 1);

                //
                //  Now we will stop on the next Usa boundary, but we may not
                //  have it yet.
                //

                } else {

                    if ((NextByteCount + UsaOffset) >= StructureSize) {

                        NextByteCount = ((NextByteCount + UsaOffset) & ~(StructureSize - 1)) -
                                        (UsaOffset & (StructureSize - 1));
                        StopForUsa = TRUE;
                    }

                    UsaOffset += NextByteCount;
                }
            }

            //
            //  Only fill in the run array if the run is allocated.
            //

            if (NextIsAllocated) {

                //
                //  Adjust if the Lcn offset (if we have one) and isn't zero.
                //

                NextLbo = LlBytesFromClusters( Vcb, NextLcn );
                NextLbo = NextLbo + NextLcnOffset;

                //
                // Now that we have properly bounded this piece of the
                // transfer, it is time to write it.
                //
                // We remember each piece of a parallel run by saving the
                // essential information in the IoRuns array.  The tranfers
                // are started up in parallel below.
                //

                IoRuns[*NumberRuns].StartingVbo = StartVbo;
                IoRuns[*NumberRuns].StartingLbo = NextLbo;
                IoRuns[*NumberRuns].BufferOffset = BufferOffset;
                IoRuns[*NumberRuns].ByteCount = NextByteCount;
                BytesInIoRuns += NextByteCount;
                *NumberRuns += 1;

            } else if (ReadRequest) {

                SystemBuffer = Add2Ptr( NtfsMapUserBuffer( Irp, NormalPagePriority ), BufferOffset );

                //
                //  If this is not a compressed stream then fill this range with zeroes.
                //  Also if this is a sparse, non-compressed stream then check if we need to
                //  reserve clusters.
                //

                if (!FlagOn( StreamFlags, COMPRESSED_STREAM )) {

#ifdef SYSCACHE_DEBUG
                    if (ScbIsBeingLogged( Scb )) {
                        FsRtlLogSyscacheEvent( Scb, SCE_ZERO_NC, SCE_FLAG_NON_CACHED | SCE_FLAG_READ | SCE_FLAG_PREPARE_BUFFERS, StartVbo + BufferOffset, NextByteCount, 0 );
                    }
#endif

                    RtlZeroMemory( SystemBuffer, NextByteCount );

                    if (FlagOn( Irp->Flags, IRP_PAGING_IO ) &&
                        FlagOn( Scb->Header.Flags, FSRTL_FLAG_USER_MAPPED_FILE ) &&
                        (FlagOn( Scb->AttributeFlags, ATTRIBUTE_FLAG_COMPRESSION_MASK | ATTRIBUTE_FLAG_SPARSE ) == ATTRIBUTE_FLAG_SPARSE)) {

                        if (!NtfsReserveClusters( IrpContext,
                                                  Scb,
                                                  StartVbo,
                                                  NextByteCount )) {

                            NtfsRaiseStatus( IrpContext, STATUS_DISK_FULL, NULL, NULL );
                        }
                    }

                //
                //  If it is compressed then make sure the range begins with a zero in
                //  case MM passed a non-zeroed buffer.  Then the compressed read/write
                //  routines will know the chunk begins with a zero.
                //

                } else {

                    *((PULONG) SystemBuffer) = 0;
                }
            }

            //
            // Now adjust everything for the next pass through the loop.
            //

            StartVbo = StartVbo + NextByteCount;
            BufferOffset += NextByteCount;
            ByteCount -= NextByteCount;
        }

        //
        //  Let's remember about those bytes we trimmed off above.  We have more
        //  bytes remaining than we think, and we didn't transfer as much, so we
        //  need to back up where we start the next transfer.
        //

        if (TrimmedByteCount != 0) {

            DebugTrace( 0,
                        Dbg,
                        ("\nByteCount + TrimmedByteCount = %x + %x = %x",
                         ByteCount,
                         TrimmedByteCount,
                         ByteCount + TrimmedByteCount) );

            DebugTrace( 0,
                        Dbg,
                        ("\nStartVbo - TrimmedByteCount = %I64x - %x = %I64x",
                         StartVbo,
                         TrimmedByteCount,
                         StartVbo - TrimmedByteCount) );

            ByteCount += TrimmedByteCount;
        }

        //
        //  If this is a sparse write and the start of the write is unallocated then drop
        //  down to the compressed path below.  Otherwise do the IO we found.
        //

        if (!SparseWrite || (BytesInIoRuns != 0)) {

            return ByteCount;
        }
    }

    ASSERT( Scb->Header.NodeTypeCode == NTFS_NTC_SCB_DATA  );

    //
    //  Initialize the compression parameters.
    //

    CompressionUnit = Scb->CompressionUnit;
    CompressionUnitInClusters = ClustersFromBytes(Vcb, CompressionUnit);
    CompressionUnitOffset = 0;
    if (CompressionUnit != 0) {
        CompressionUnitOffset = ((ULONG)StartVbo) & (CompressionUnit - 1);
    }

    //
    //  We must be able to wait in order to get the byte count and related values right.
    //

    if (!FlagOn(IrpContext->State, IRP_CONTEXT_STATE_WAIT)) {
        NtfsRaiseStatus( IrpContext, STATUS_CANT_WAIT, NULL, NULL );
    }

    //
    //  Handle the compressed read case.
    //

    if (IrpContext->MajorFunction == IRP_MJ_READ) {

        //
        //  If we have not already mapped the user buffer, then do it.
        //

        if (CompressionContext->SystemBuffer == NULL) {
            CompressionContext->SystemBuffer = NtfsMapUserBuffer( Irp, NormalPagePriority );
        }

        BytesInIoRuns = 0;

        //
        //  Adjust StartVbo and ByteCount by the offset.
        //

        ((ULONG)StartVbo) -= CompressionUnitOffset;
        ByteCount += CompressionUnitOffset;

        //
        //  Capture this value for maintaining the byte count to
        //  return.
        //

        ReturnByteCount = ByteCount;

        //
        //  Now, the ByteCount we actually have to process has to
        //  be rounded up to the next compression unit.
        //

        ByteCount = BlockAlign( ByteCount, (LONG)CompressionUnit );

        //
        //  Make sure we never try to handle more than a LARGE_BUFFER_SIZE
        //  at once, forcing our caller to call back.
        //

        if (ByteCount > LARGE_BUFFER_SIZE) {
            ByteCount = LARGE_BUFFER_SIZE;
        }

        //
        //  In case we find no allocation....
        //

        IoRuns[0].ByteCount = 0;

        while (ByteCount != 0) {

            //
            // Try to lookup the first run.  If there is just a single run,
            // we may just be able to pass it on.
            //

            ASSERT( !FlagOn( ((ULONG) StartVbo), Vcb->ClusterMask ));
            StartingVcn = LlClustersFromBytesTruncate( Vcb, StartVbo );

            NextIsAllocated = NtfsLookupAllocation( IrpContext,
                                                    Scb,
                                                    StartingVcn,
                                                    &NextLcn,
                                                    &NextClusterCount,
                                                    NULL,
                                                    NULL );

#if (defined(NTFS_RWCMP_TRACE))
            ASSERT(!IsSyscache(Scb) || (NextClusterCount < 16) || !NextIsAllocated);
#endif

            //
            //  Adjust from NextLcn to Lbo.
            //
            //  If next run is larger than we need, "ya get what you need".
            //  Note that after this we are guaranteed that the HighPart of
            //  NextByteCount is 0.
            //


            if ((ULONG)NextClusterCount >= ClustersFromBytes( Vcb, ByteCount )) {

                NextByteCount = ByteCount;

            } else {

                NextByteCount = BytesFromClusters( Vcb, (ULONG)NextClusterCount );
            }

            //
            //  Convert NextLcn from clusters to a byte offset (Lbo).
            //

            NextLbo = LlBytesFromClusters( Vcb, NextLcn );

            //
            //  Only fill in the run array if the run is allocated.
            //

            if (NextIsAllocated) {

                //
                //  If the Lbos are contiguous, then we can do a contiguous
                //  transfer, so we just increase the current byte count.
                //

                if ((*NumberRuns != 0) && (NextLbo ==
                                           (IoRuns[*NumberRuns - 1].StartingLbo +
                                            (IoRuns[*NumberRuns - 1].ByteCount)))) {

                    //
                    //  Stop on the first compression unit boundary after the
                    //  penultimate run in the default io array.
                    //

                    if (*NumberRuns >= NTFS_MAX_PARALLEL_IOS - 1) {

                        //
                        //  First, if we are beyond the penultimate run and we are starting
                        //  a run in a different compression unit than the previous
                        //  run, then we can just break out and not use the current
                        //  run.  (*NumberRuns has not yet been incremented.)
                        //  In order for it to be in the same run it can't begin at
                        //  offset 0 in the compression unit and it must be contiguous
                        //  with the virtual end of the previous run.
                        //  The only case where this can happen in the running system is
                        //  if there is a file record boundary in the middle of the
                        //  compression unit.
                        //

                        if ((*NumberRuns > NTFS_MAX_PARALLEL_IOS - 1) &&
                            (!FlagOn( (ULONG) StartVbo, CompressionUnit - 1 ) ||
                            (StartVbo != (IoRuns[*NumberRuns - 1].StartingVbo +
                                          IoRuns[*NumberRuns - 1].ByteCount)))) {

                            break;

                        //
                        //  Else detect the case where this run ends on or
                        //  crosses a compression unit boundary.  In this case,
                        //  just make sure the run stops on a compression unit
                        //  boundary, and break out to return it.
                        //

                        } else if ((((ULONG) StartVbo & (CompressionUnit - 1)) + NextByteCount) >=
                                   CompressionUnit) {

                            NextByteCount -= (((ULONG)StartVbo) + NextByteCount) & (CompressionUnit - 1);
                            BytesInIoRuns += NextByteCount;

                            if (ReturnByteCount > NextByteCount) {
                                ReturnByteCount -= NextByteCount;
                            } else {
                                ReturnByteCount = 0;
                            }

                            IoRuns[*NumberRuns - 1].ByteCount += NextByteCount;

                            break;
                        }
                    }

                    IoRuns[*NumberRuns - 1].ByteCount += NextByteCount;

                //
                //  Otherwise it is time to start a new run, if there is space for one.
                //

                } else {

                    //
                    //  If we have filled up the current I/O runs array, then we
                    //  will grow it once to a size which would allow the worst
                    //  case compression unit (all noncontiguous clusters) to
                    //  start at index NTFS_MAX_PARALLEL_IOS - 1.
                    //  The following if statement enforces
                    //  this case as the worst case.  With 16 clusters per compression
                    //  unit, the theoretical maximum number of parallel I/Os
                    //  would be 16 + NTFS_MAX_PARALLEL_IOS - 1, since we stop on the
                    //  first compression unit boundary after the penultimate run.
                    //  Normally, of course we will do much fewer.
                    //

                    if ((*NumberRuns == NTFS_MAX_PARALLEL_IOS) &&
                        (CompressionContext->AllocatedRuns == NTFS_MAX_PARALLEL_IOS)) {

                        PIO_RUN NewIoRuns;

                        NewIoRuns = NtfsAllocatePool( NonPagedPool,
                                                       (CompressionUnitInClusters + NTFS_MAX_PARALLEL_IOS - 1) * sizeof(IO_RUN) );

                        RtlCopyMemory( NewIoRuns,
                                       CompressionContext->IoRuns,
                                       NTFS_MAX_PARALLEL_IOS * sizeof(IO_RUN) );

                        IoRuns = CompressionContext->IoRuns = NewIoRuns;
                        CompressionContext->AllocatedRuns = CompressionUnitInClusters + NTFS_MAX_PARALLEL_IOS - 1;
                    }

                    //
                    // We remember each piece of a parallel run by saving the
                    // essential information in the IoRuns array.  The transfers
                    // will be started up in parallel below.
                    //

                    ASSERT(*NumberRuns < CompressionContext->AllocatedRuns);

                    IoRuns[*NumberRuns].StartingVbo = StartVbo;
                    IoRuns[*NumberRuns].StartingLbo = NextLbo;
                    IoRuns[*NumberRuns].BufferOffset = BufferOffset;
                    IoRuns[*NumberRuns].ByteCount = NextByteCount;
                    if ((*NumberRuns + 1) < CompressionContext->AllocatedRuns) {
                        IoRuns[*NumberRuns + 1].ByteCount = 0;
                    }

                    //
                    //  Stop on the first compression unit boundary after the
                    //  penultimate run in the default array.
                    //

                    if (*NumberRuns >= NTFS_MAX_PARALLEL_IOS - 1) {

                        //
                        //  First, if we are beyond penultimate run and we are starting
                        //  a run in a different compression unit than the previous
                        //  run, then we can just break out and not use the current
                        //  run.  (*NumberRuns has not yet been incremented.)
                        //

                        if ((*NumberRuns > NTFS_MAX_PARALLEL_IOS - 1) &&
                            ((((ULONG)StartVbo) & ~(CompressionUnit - 1)) !=
                             ((((ULONG)IoRuns[*NumberRuns - 1].StartingVbo) +
                               IoRuns[*NumberRuns - 1].ByteCount - 1) &
                               ~(CompressionUnit - 1)))) {

                            break;

                        //
                        //  Else detect the case where this run ends on or
                        //  crosses a compression unit boundary.  In this case,
                        //  just make sure the run stops on a compression unit
                        //  boundary, and break out to return it.
                        //

                        } else if ((((ULONG)StartVbo) & ~(CompressionUnit - 1)) !=
                            ((((ULONG)StartVbo) + NextByteCount) & ~(CompressionUnit - 1))) {

                            NextByteCount -= (((ULONG)StartVbo) + NextByteCount) & (CompressionUnit - 1);
                            IoRuns[*NumberRuns].ByteCount = NextByteCount;
                            BytesInIoRuns += NextByteCount;

                            if (ReturnByteCount > NextByteCount) {
                                ReturnByteCount -= NextByteCount;
                            } else {
                                ReturnByteCount = 0;
                            }

                            *NumberRuns += 1;
                            break;
                        }
                    }
                    *NumberRuns += 1;
                }

                BytesInIoRuns += NextByteCount;
                BufferOffset += NextByteCount;
            }

            //
            //  Now adjust everything for the next pass through the loop.
            //

            StartVbo += NextByteCount;
            ByteCount -= NextByteCount;

            if (ReturnByteCount > NextByteCount) {
                ReturnByteCount -= NextByteCount;
            } else {
                ReturnByteCount = 0;
            }
        }

        //
        //  Allocate the compressed buffer if it is not already allocated.
        //

        if (BytesInIoRuns < CompressionUnit) {
            BytesInIoRuns = CompressionUnit;
        }
        NtfsAllocateCompressionBuffer( IrpContext, Scb, Irp, CompressionContext, &BytesInIoRuns );

        return ReturnByteCount;

    //
    //  Otherwise handle the compressed write case
    //

    } else {

        LONGLONG SavedValidDataToDisk;
        PUCHAR UncompressedBuffer;
        ULONG UncompressedOffset;
        ULONG ClusterOffset;
        BOOLEAN NoopRange;

        ULONG CompressedOffset;
        PBCB Bcb;

        ASSERT(IrpContext->MajorFunction == IRP_MJ_WRITE);

        //
        //  Adjust StartVbo and ByteCount by the offset.
        //

        ((ULONG)StartVbo) -= CompressionUnitOffset;
        ByteCount += CompressionUnitOffset;

        //
        //  Maintain additional bytes to be returned in ReturnByteCount,
        //  and adjust this if we are larger than a LARGE_BUFFER_SIZE.
        //

        ReturnByteCount = 0;
        if (ByteCount > LARGE_BUFFER_SIZE) {
            ReturnByteCount = ByteCount - LARGE_BUFFER_SIZE;
            ByteCount = LARGE_BUFFER_SIZE;
        }

        CompressedSize = ByteCount;
        if (!FlagOn( StreamFlags, COMPRESSED_STREAM ) && (CompressionUnit != 0)) {

            //
            //  To reduce pool consumption, make an educated/optimistic guess on
            //  how much pool we need to store the compressed data.  If we are wrong
            //  we will just have to do some more I/O.
            //

            CompressedSize = BlockAlign( ByteCount, (LONG)CompressionUnit );
            CompressedSize += Vcb->BytesPerCluster;

            if (CompressedSize > LARGE_BUFFER_SIZE) {
                CompressedSize = LARGE_BUFFER_SIZE;
            }

            //
            //  Allocate the compressed buffer if it is not already allocated, and this
            //  isn't the compressed stream.
            //

            if (SparseWrite &&
                (CompressionContext->SystemBuffer == NULL)) {

                CompressionContext->SystemBuffer = NtfsMapUserBuffer( Irp, NormalPagePriority );
            }

            //
            //  At this point BufferOffset should always be 0.
            //

            BufferOffset = 0;
            NtfsAllocateCompressionBuffer( IrpContext, Scb, Irp, CompressionContext, &CompressedSize );

            CompressionContext->DataTransformed = TRUE;
        }

        //
        //  Loop to compress the user's buffer.
        //

        CompressedOffset = 0;
        UncompressedOffset = 0;
        Bcb = NULL;

        try {

            BOOLEAN ChangeAllocation;
            ULONG SparseFileBias;

            //
            //  Loop as long as we will not overflow our compressed buffer, and we
            //  are also guaranteed that we will not overflow the extended IoRuns array
            //  in the worst case (and as long as we have more write to satisfy!).
            //

            while ((ByteCount != 0) && (*NumberRuns <= NTFS_MAX_PARALLEL_IOS - 1) &&
                   (((CompressedOffset + CompressionUnit) <= CompressedSize) ||
                    FlagOn( StreamFlags, COMPRESSED_STREAM ))) {

                LONGLONG SizeToCompress;

                //
                //  State variables to determine a reallocate range.
                //

                VCN DeleteVcn;
                LONGLONG DeleteCount;
                LONGLONG AllocateCount;

                DeleteCount = 0;
                AllocateCount = 0;

                NoopRange = FALSE;
                SparseFileBias = 0;
                ClusterOffset = 0;

                //
                //  Assume we are only compressing to FileSize, or else
                //  reduce to one compression unit.  The maximum compression size
                //  we can accept is saving at least one cluster.
                //

                NtfsAcquireFsrtlHeader( Scb );

                //
                //  If this is a compressed stream then we may need to go past file size.
                //

                if (FlagOn( StreamFlags, COMPRESSED_STREAM)) {

                    SizeToCompress = BlockAlign( Scb->Header.FileSize.QuadPart, (LONG)CompressionUnit );
                    SizeToCompress -= StartVbo;

                } else {

                    SizeToCompress = Scb->Header.FileSize.QuadPart - StartVbo;
                }

                NtfsReleaseFsrtlHeader( Scb );

                //
                //  It is possible that if this is the lazy writer that the file
                //  size was rolled back from a cached write which is aborting.
                //  In that case we either truncate the write or can exit this
                //  loop if there is nothing left to write.
                //

                if (SizeToCompress <= 0) {

                    ByteCount = 0;
                    break;
                }

                //
                //  Note if CompressionUnit is 0, then we do not need SizeToCompress.
                //

                if (SizeToCompress > CompressionUnit) {
                    SizeToCompress = (LONGLONG)CompressionUnit;
                }

#ifdef  COMPRESS_ON_WIRE
                //
                //  For the normal uncompressed stream, map the data and compress it
                //  into the allocated buffer.
                //

                if (!FlagOn( StreamFlags, COMPRESSED_STREAM )) {

#endif

                    //
                    //  If this is a sparse write then we zero the beginning and
                    //  end of the compression unit as needed and copy in the user
                    //  data.
                    //

                    if (SparseWrite) {

                        //
                        //  Use local variables to position ourselves in the
                        //  compression context buffer and user system buffer.
                        //  We'll reuse StructureSize to show the number of
                        //  user bytes copied to the buffer.
                        //

                        SystemBuffer = Add2Ptr( CompressionContext->SystemBuffer,
                                                CompressionContext->SystemBufferOffset + UncompressedOffset );

                        UncompressedBuffer = Add2Ptr( CompressionContext->CompressionBuffer,
                                                      BufferOffset );

                        //
                        //  Zero the beginning of the compression buffer if necessary.
                        //

                        if (CompressionUnitOffset != 0) {

#ifdef SYSCACHE_DEBUG
                            if (ScbIsBeingLogged( Scb )) {
                                FsRtlLogSyscacheEvent( Scb, SCE_ZERO_NC, SCE_FLAG_NON_CACHED | SCE_FLAG_PREPARE_BUFFERS | SCE_FLAG_WRITE, StartVbo, CompressionUnitOffset, 0 );
                            }
#endif

                            RtlZeroMemory( UncompressedBuffer, CompressionUnitOffset );
                            UncompressedBuffer += CompressionUnitOffset;
                        }

                        //
                        //  Now copy the user data into the buffer.
                        //

                        if ((ULONG) SizeToCompress < ByteCount) {

                            StructureSize = (ULONG) BlockAlign( SizeToCompress, (LONG)Vcb->BytesPerSector ) - CompressionUnitOffset;

                        } else {

                            StructureSize = ByteCount - CompressionUnitOffset;
                        }

                        RtlCopyMemory( UncompressedBuffer,
                                       SystemBuffer,
                                       StructureSize );

                        //
                        //  It may be necessary to zero the end of the buffer.
                        //

                        if ((ULONG) SizeToCompress > ByteCount) {

#ifdef SYSCACHE_DEBUG
                            if (ScbIsBeingLogged( Scb )) {
                                FsRtlLogSyscacheEvent( Scb, SCE_ZERO_NC, SCE_FLAG_NON_CACHED | SCE_FLAG_PREPARE_BUFFERS | SCE_FLAG_WRITE, StartVbo + StructureSize, SizeToCompress - ByteCount, 1 );
                            }
#endif

                            RtlZeroMemory( Add2Ptr( UncompressedBuffer, StructureSize ),
                                           (ULONG) SizeToCompress - ByteCount );
                        }

                        FinalCompressedSize = CompressionUnit;
                        Status = STATUS_SUCCESS;

                    } else {

                        UncompressedBuffer = NULL;
                        if (CompressionUnit != 0) {

                            //
                            //  Map the aligned range, set it dirty, and flush.  We have to
                            //  loop, because the Cache Manager limits how much and over what
                            //  boundaries we can map.  Only do this if there is a file
                            //  object.  Otherwise we will assume we are writing the
                            //  clusters directly to disk (via NtfsWriteClusters).
                            //

                            if (Scb->FileObject != NULL) {

                                CcMapData( Scb->FileObject,
                                           (PLARGE_INTEGER)&StartVbo,
                                           (ULONG)SizeToCompress,
                                           TRUE,
                                           &Bcb,
                                           &UncompressedBuffer );

#ifdef MAPCOUNT_DBG
                                IrpContext->MapCount += 1;
#endif

                            } else {

                                UncompressedBuffer = MmGetSystemAddressForMdlSafe( CompressionContext->SavedMdl, NormalPagePriority );

                                if (UncompressedBuffer == NULL) {

                                    NtfsRaiseStatus( IrpContext, STATUS_INSUFFICIENT_RESOURCES, NULL, NULL );
                                }
                            }

                            //
                            //  If we have not already allocated the workspace, then do it.  We don't
                            //  need the workspace if the file is not compressed (i.e. sparse).
                            //

                            if ((CompressionContext->WorkSpace == NULL) &&
                                FlagOn( Scb->AttributeFlags, ATTRIBUTE_FLAG_COMPRESSION_MASK )) {

                                ULONG CompressWorkSpaceSize;
                                ULONG FragmentWorkSpaceSize;

                                (VOID) RtlGetCompressionWorkSpaceSize( (USHORT)((Scb->AttributeFlags & ATTRIBUTE_FLAG_COMPRESSION_MASK) + 1),
                                                                       &CompressWorkSpaceSize,
                                                                       &FragmentWorkSpaceSize );

                                //
                                //  It is critical to ask for the work space buffer.  It is the only
                                //  one large enough to hold the bigger ia64 pointers.
                                //

                                NtfsCreateMdlAndBuffer( IrpContext,
                                                        Scb,
                                                        RESERVED_BUFFER_WORKSPACE_NEEDED,
                                                        &CompressWorkSpaceSize,
                                                        NULL,
                                                        &CompressionContext->WorkSpace );
                            }
                        }

                        try {

                            //
                            //  If we are moving an uncompressed file, then do not compress
                            //

                            if (CompressionUnit == 0) {
                                FinalCompressedSize = ByteCount;
                                Status = STATUS_SUCCESS;

                            //
                            //  If we are writing compressed, compress it now.
                            //

                            } else if (!FlagOn(Scb->ScbState, SCB_STATE_WRITE_COMPRESSED) ||
                                ((Status =
                                  RtlCompressBuffer( (USHORT)((Scb->AttributeFlags & ATTRIBUTE_FLAG_COMPRESSION_MASK) + 1),
                                                     UncompressedBuffer,
                                                     (ULONG)SizeToCompress,
                                                     CompressionContext->CompressionBuffer + CompressedOffset,
                                                     (CompressionUnit - Vcb->BytesPerCluster),
                                                     NTFS_CHUNK_SIZE,
                                                     &FinalCompressedSize,
                                                     CompressionContext->WorkSpace )) ==

                                                    STATUS_BUFFER_TOO_SMALL)) {

                                //
                                //  If it did not compress, just copy it over, sigh.  This looks bad,
                                //  but it should virtually never occur assuming compression is working
                                //  ok.  In the case where FileSize is in this unit, make sure we
                                //  at least copy to a sector boundary.
                                //

                                FinalCompressedSize = CompressionUnit;

                                if (!SparseWrite) {

                                    RtlCopyMemory( CompressionContext->CompressionBuffer + CompressedOffset,
                                                   UncompressedBuffer,
                                                   (ULONG)BlockAlign( SizeToCompress, (LONG)Vcb->BytesPerSector));
                                }

                                ASSERT(FinalCompressedSize <= (CompressedSize - CompressedOffset));
                                Status = STATUS_SUCCESS;
                            }

                        //
                        //  Probably Gary's compression routine faulted, but blame it on
                        //  the user buffer!
                        //

                        } except(NtfsCompressionFilter(IrpContext, GetExceptionInformation())) {
                            NtfsRaiseStatus( IrpContext, STATUS_INVALID_USER_BUFFER, NULL, NULL );
                        }
                    }

                //
                //  For the compressed stream, we need to scan the compressed data
                //  to see how much we actually have to write.
                //

#ifdef  COMPRESS_ON_WIRE
                } else {

                    //
                    //  Don't walk off the end of the data being written, because that
                    //  would cause bogus faults in the compressed stream.
                    //

                    if (SizeToCompress > ByteCount) {
                        SizeToCompress = ByteCount;
                    }

                    //
                    //  Map the compressed data.
                    //

                    CcMapData( Scb->Header.FileObjectC,
                               (PLARGE_INTEGER)&StartVbo,
                               (ULONG)SizeToCompress,
                               TRUE,
                               &Bcb,
                               &UncompressedBuffer );

#ifdef MAPCOUNT_DBG
                    IrpContext->MapCount++;
#endif

                    FinalCompressedSize = 0;

                    //
                    //  Loop until we get an error or stop advancing.
                    //

                    RangePtr = UncompressedBuffer + CompressionUnit;
                    do {
                        Status = RtlDescribeChunk( (USHORT)((Scb->AttributeFlags & ATTRIBUTE_FLAG_COMPRESSION_MASK) + 1),
                                                   &UncompressedBuffer,
                                                   (PUCHAR)RangePtr,
                                                   (PUCHAR *)&SystemBuffer,
                                                   &CompressedSize );

                        //
                        //  Remember if we see any nonzero chunks
                        //

                        FinalCompressedSize |= CompressedSize;

                    } while (NT_SUCCESS(Status));

                    //
                    //  If we terminated on anything but STATUS_NO_MORE_ENTRIES, we
                    //  somehow picked up some bad data.
                    //

                    if (Status != STATUS_NO_MORE_ENTRIES) {
                        ASSERT(Status == STATUS_NO_MORE_ENTRIES);
                        NtfsRaiseStatus( IrpContext, Status, NULL, NULL );
                    }
                    Status = STATUS_SUCCESS;

                    //
                    //  If we got any nonzero chunks, then calculate size of buffer to write.
                    //  (Size does not include terminating Ushort of 0.)
                    //

                    if (FinalCompressedSize != 0) {

                        FinalCompressedSize = BlockAlignTruncate( (ULONG_PTR)UncompressedBuffer, (ULONG)CompressionUnit );

                        //
                        //  If the Lazy Writer is writing beyond the end of the compression
                        //  unit (there are dirty pages at the end of the compression unit)
                        //  then we can throw this data away.
                        //

                        if (FinalCompressedSize < CompressionUnitOffset) {

                            //
                            //  Set up to move to the next compression unit.
                            //

                            NoopRange = TRUE;
                            ChangeAllocation = FALSE;

                            //
                            //  Set TempVbo to the compression unit offset.  The
                            //  number of bytes to skip over is the remaining
                            //  bytes in a compression unit.
                            //

                            TempVbo = CompressionUnitOffset;

                        //
                        //  If the Lazy Writer does not have the beginning of the compression
                        //  unit then raise out of here and wait for the write which includes
                        //  the beginning.
                        //

                        } else if (CompressionUnitOffset != 0) {
#if defined(COMPRESS_ON_WIRE) && defined(NTFS_RWC_DEBUG)
                            ASSERT( !NtfsBreakOnConflict ||
                                    (Scb->LazyWriteThread[1] == PsGetCurrentThread()) );
#endif
                            NtfsRaiseStatus( IrpContext, STATUS_FILE_LOCK_CONFLICT, NULL, NULL );

                        //
                        //  If we saw more chunks than our writer is trying to write (it
                        //  more or less has to be the Lazy Writer), then we need to reject
                        //  this request and assume he will come back later for the entire
                        //  amount.  This could be a problem for WRITE_THROUGH.
                        //

                        } else if (FinalCompressedSize > ByteCount) {
#ifdef NTFS_RWC_DEBUG
                            ASSERT( !NtfsBreakOnConflict ||
                                    (Scb->LazyWriteThread[1] == PsGetCurrentThread()) );

                            ASSERT( Scb->LazyWriteThread[1] == PsGetCurrentThread() );
#endif
                            NtfsRaiseStatus( IrpContext, STATUS_FILE_LOCK_CONFLICT, NULL, NULL );
                        }
                    }
                }
#endif

                NtfsUnpinBcb( IrpContext, &Bcb );

                //
                //  Round the FinalCompressedSize up to a cluster boundary now.
                //

                FinalCompressedSize = (FinalCompressedSize + Vcb->BytesPerCluster - 1) &
                                      ~(Vcb->BytesPerCluster - 1);

                //
                //  If the Status was not success, then we have to do something.
                //

                if (Status != STATUS_SUCCESS) {

                    //
                    //  If it was actually an error, then we will raise out of
                    //  here.
                    //

                    if (!NT_SUCCESS(Status)) {
                        NtfsRaiseStatus( IrpContext, Status, NULL, NULL );

                    //
                    //  If the buffer compressed to all zeros, then we will
                    //  not allocate anything.
                    //

                    } else if (Status == STATUS_BUFFER_ALL_ZEROS) {
                        FinalCompressedSize = 0;
                    }
                }

                if (!NoopRange) {

                    StartingVcn = LlClustersFromBytesTruncate( Vcb, StartVbo );

                    //
                    //  Time to get the Scb if we do not have it already.  We
                    //  need to serialize our changes of the Mcb.
                    //  N.B. -- We may _not_ always be the top level request.
                    //  Converting a compressed stream to nonresident can
                    //  send us down this path with Irp != OriginatingIrp.
                    //

                    if (!CompressionContext->ScbAcquired) {

                        NtfsPurgeFileRecordCache( IrpContext );
                        NtfsAcquireExclusiveScb( IrpContext, Scb );
                        CompressionContext->ScbAcquired = TRUE;
                    }

                    NextIsAllocated = NtfsLookupAllocation( IrpContext,
                                                            Scb,
                                                            StartingVcn,
                                                            &NextLcn,
                                                            &NextClusterCount,
                                                            NULL,
                                                            NULL );

                    //
                    //  If this originally was a sparse write but we were defragging
                    //  then we need to be careful if the range is unallocated.  In
                    //  that case we really need to do the full sparse support.  Break
                    //  out of the loop at this point and perform the IO with
                    //  the ranges we already have.
                    //

                    if (!NextIsAllocated && OriginalSparseWrite && !SparseWrite) {

                        break;
                    }

                    //
                    //  If the StartingVcn is allocated, we always have to check
                    //  if we need to delete something, or if in the unusual case
                    //  there is a hole there smaller than a compression unit.
                    //
                    //  If this is a sparse write then we never have anything to
                    //  deallocate.
                    //

                    FinalCompressedClusters = ClustersFromBytes( Vcb, FinalCompressedSize );
                    ChangeAllocation = FALSE;

                    if (SparseWrite) {

                        //
                        //  It is possible that the compression unit has been allocated since we
                        //  tested allocation when we entered this routine.  If so we can
                        //  write directly to disk in the allocated range.  We need to
                        //  modify the range being written however.
                        //

                        if (NextIsAllocated) {

                            //
                            //  Move forward to the beginning of this write.
                            //

                            SparseFileBias = CompressionUnitOffset;
                            ((ULONG) StartVbo) += CompressionUnitOffset;
                            CompressedOffset += CompressionUnitOffset;
                            BufferOffset += CompressionUnitOffset;
                            FinalCompressedSize -= CompressionUnitOffset;

                            if (FinalCompressedSize > (ByteCount - CompressionUnitOffset)) {

                                FinalCompressedSize = (ByteCount - CompressionUnitOffset);
                            }

                            StartingVcn = LlClustersFromBytesTruncate( Vcb, StartVbo );

                            //
                            //  Remember that we might not be on a cluster boundary at this point.
                            //

                            ClusterOffset = (ULONG) StartVbo & Vcb->ClusterMask;

                            //
                            //  Look up the correct range on the disk.
                            //

                            NextIsAllocated = NtfsLookupAllocation( IrpContext,
                                                                    Scb,
                                                                    StartingVcn,
                                                                    &NextLcn,
                                                                    &NextClusterCount,
                                                                    NULL,
                                                                    NULL );

                            ASSERT( NextIsAllocated );

                        } else {

                            //
                            //  Set the Scb flag to indicate we need to serialize non-cached IO
                            //  with the Mcb.
                            //

                            SetFlag( Scb->ScbState, SCB_STATE_PROTECT_SPARSE_MCB );
                        }

                    } else if (NextIsAllocated || (NextClusterCount < CompressionUnitInClusters)) {

                        VCN TempClusterCount;

                        //
                        //  If we need fewer clusters than allocated, then just allocate them.
                        //  But if we need more clusters, then deallocate all the ones we have
                        //  now, otherwise we could corrupt file data if we back out a write
                        //  after actually having written the sectors.  (For example, we could
                        //  extend from 5 to 6 clusters and write 6 clusters of compressed data.
                        //  If we have to back that out we will have a 6-cluster pattern of
                        //  compressed data with one sector deallocated!).
                        //

                        NextIsAllocated = NextIsAllocated &&
                                          (NextClusterCount >= FinalCompressedClusters);

                        //
                        //  If we are cleaning up a hole, or the next run is unusable,
                        //  then make sure we just delete it rather than sliding the
                        //  tiny run up with SplitMcb.  Note that we have the Scb exclusive,
                        //  and that since all compressed files go through the cache, we
                        //  know that the dirty pages can't go away even if we spin out
                        //  of here with ValidDataToDisk bumped up too high.
                        //

                        SavedValidDataToDisk = Scb->ValidDataToDisk;
                        if (!NextIsAllocated && ((StartVbo + CompressionUnit) > Scb->ValidDataToDisk)) {
                            Scb->ValidDataToDisk = StartVbo + CompressionUnit;
                        }

                        //
                        //  Also, we need to handle the case where a range within
                        //  ValidDataToDisk is fully allocated.  If we are going to compress
                        //  now, then we have the same problem with failing after writing
                        //  the compressed data out, i.e., because we are fully allocated
                        //  we would see the data as uncompressed after an abort, yet we
                        //  have written compressed data. We do not implement the entire
                        //  loop necessary to really see if the compression unit is fully
                        //  allocated - we just verify that NextClusterCount is less than
                        //  a compression unit and that the next run is not allocated.  Just
                        //  because the next contiguous run is also allocated does not guarantee
                        //  that the compression unit is fully allocated, but maybe we will
                        //  get some small defrag gain by reallocating what we need in a
                        //  single run.
                        //

                        NextIsAllocated = NextIsAllocated &&
                                          ((StartVbo >= Scb->ValidDataToDisk) ||
                                           (FinalCompressedClusters == CompressionUnitInClusters) ||
                                           ((NextClusterCount < CompressionUnitInClusters) &&
                                            (!NtfsLookupAllocation( IrpContext,
                                                                    Scb,
                                                                    StartingVcn + NextClusterCount,
                                                                    &NextLbo,
                                                                    &TempClusterCount,
                                                                    NULL,
                                                                    NULL ) ||
                                             (NextLbo != UNUSED_LCN))));

                        //
                        //  If we are not keeping any allocation, or we need less
                        //  than a compression unit, then call NtfsDeleteAllocation.
                        //


                        if (!NextIsAllocated ||
                            (FinalCompressedClusters < CompressionUnitInClusters)) {

                            //
                            //  Skip this explicit delete if we are rewriting within
                            //  ValidDataToDisk.  We know we won't be doing a SplitMcb.
                            //

                            DeleteVcn = StartingVcn;

                            if (NextIsAllocated) {

                                DeleteVcn += FinalCompressedClusters;
                            }

                            DeleteCount = CompressionUnit;

                            if (CompressionUnit == 0) {

                                DeleteCount = ByteCount;
                            }

                            DeleteCount = LlClustersFromBytes( Vcb, DeleteCount );

                            //
                            //  Take the explicit DeleteAllocation path if there is a chance
                            //  we might do a SplitMcb.  This is true for a compressed write
                            //  which extends into a new compression unit.
                            //

                            if ((CompressionUnit != 0) &&

                                ((StartingVcn + DeleteCount) >
                                 LlClustersFromBytesTruncate( Vcb,
                                                              ((Scb->ValidDataToDisk + CompressionUnit - 1) &
                                                               ~((LONGLONG) (CompressionUnit - 1))) ))) {

                                NtfsDeleteAllocation( IrpContext,
                                                      IrpSp->FileObject,
                                                      Scb,
                                                      DeleteVcn,
                                                      StartingVcn + DeleteCount - 1,
                                                      TRUE,
                                                      FALSE );

                                //
                                //  Set the DeleteCount to 0 so we know there is no other deallocate
                                //  to do.
                                //

                                DeleteCount = 0;

                            //
                            //  Bias the DeleteCount by the number of clusters into the compression
                            //  unit we are beginning.
                            //

                            } else {

                                DeleteCount -= (DeleteVcn - StartingVcn);
                                ASSERT( DeleteCount >= 0 );
                            }

                            ChangeAllocation = TRUE;
                        }

                        Scb->ValidDataToDisk = SavedValidDataToDisk;
                    }

                    //
                    //  Now deal with the case where we do need to allocate space.
                    //

                    TempVbo = StartVbo;
                    if (FinalCompressedSize != 0) {

                        //
                        //  If this compression unit is not (sufficiently) allocated, then
                        //  do it now.
                        //

                        if (!NextIsAllocated ||
                            ((NextClusterCount < FinalCompressedClusters) && !SparseWrite)) {

                            AllocateCount = FinalCompressedClusters;

                        } else {

                            AllocateCount = 0;
                        }

                        //
                        //  Now call our reallocate routine to do the work.
                        //

                        if ((DeleteCount != 0) || (AllocateCount != 0)) {

#ifdef SYSCACHE_DEBUG
                            if (ScbIsBeingLogged( Scb )) {

                                FsRtlLogSyscacheEvent( Scb, SCE_ADD_ALLOCATION, SCE_FLAG_PREPARE_BUFFERS, StartingVcn, AllocateCount, DeleteCount );
                            }
#endif

                            NtfsReallocateRange( IrpContext,
                                                 Scb,
                                                 DeleteVcn,
                                                 DeleteCount,
                                                 StartingVcn,
                                                 AllocateCount,
                                                 NULL );

                            ChangeAllocation = TRUE;
                        }

                        //
                        //  If we added space, something may have moved, so we must
                        //  look up our position and get a new index. Also relookup
                        //  to get a rangeptr and index
                        //

                        NtfsLookupAllocation( IrpContext,
                                              Scb,
                                              StartingVcn,
                                              &NextLcn,
                                              &NextClusterCount,
                                              &RangePtr,
                                              &Index );

                        //
                        //  Now loop to update the IoRuns array.
                        //

                        CompressedOffset += FinalCompressedSize;
                        while (FinalCompressedSize != 0) {

                            LONGLONG RunOffset;

                            //
                            //  Get the actual number of clusters being written.
                            //

                            FinalCompressedClusters = ClustersFromBytes( Vcb, FinalCompressedSize );

                            //
                            //  Try to lookup the first run.  If there is just a single run,
                            //  we may just be able to pass it on.  Index into the Mcb directly
                            //  for greater speed.
                            //

                            NextIsAllocated = NtfsGetSequentialMcbEntry( &Scb->Mcb,
                                                                         &RangePtr,
                                                                         Index,
                                                                         &StartingVcn,
                                                                         &NextLcn,
                                                                         &NextClusterCount );

                            //
                            //  It is possible that we could walk across an Mcb boundary and the
                            //  following entry isn't loaded.  In that case we want to look
                            //  up the allocation specifically to force the Mcb load.
                            //

                            if (Index == MAXULONG) {

                                //
                                //  A failure on NtfsGetSequentialMcbEntry above will modify StartingVcn.
                                //  Recalculate here based on TempVbo.
                                //

                                StartingVcn = LlClustersFromBytesTruncate( Vcb, TempVbo );
                                NextIsAllocated = NtfsLookupAllocation( IrpContext,
                                                                        Scb,
                                                                        StartingVcn,
                                                                        &NextLcn,
                                                                        &NextClusterCount,
                                                                        &RangePtr,
                                                                        &Index );

                                ASSERT( NextIsAllocated );
                                NextIsAllocated = NtfsGetSequentialMcbEntry( &Scb->Mcb,
                                                                             &RangePtr,
                                                                             Index,
                                                                             &StartingVcn,
                                                                             &NextLcn,
                                                                             &NextClusterCount );
                            }

                            Index += 1;

                            ASSERT(NextIsAllocated);
                            ASSERT(NextLcn != UNUSED_LCN);

                            //
                            //  Our desired Vcn could be in the middle of this run, so do
                            //  some adjustments.
                            //

                            RunOffset = Int64ShraMod32(TempVbo, Vcb->ClusterShift) - StartingVcn;

                            ASSERT( ((PLARGE_INTEGER)&RunOffset)->HighPart >= 0 );
                            ASSERT( NextClusterCount > RunOffset );

                            NextLcn = NextLcn + RunOffset;
                            NextClusterCount = NextClusterCount - RunOffset;

                            //
                            //  Adjust from NextLcn to Lbo.  NextByteCount may overflow out of 32 bits
                            //  but we will catch that below when we compare clusters.
                            //

                            NextLbo = LlBytesFromClusters( Vcb, NextLcn ) + ClusterOffset;
                            NextByteCount = BytesFromClusters( Vcb, (ULONG)NextClusterCount );

                            //
                            //  If next run is larger than we need, "ya get what you need".
                            //  Note that after this we are guaranteed that the HighPart of
                            //  NextByteCount is 0.
                            //

                            if (NextClusterCount >= FinalCompressedClusters) {

                                NextByteCount = FinalCompressedSize;
                            }

                            //
                            //  If the Lbos are contiguous, then we can do a contiguous
                            //  transfer, so we just increase the current byte count.
                            //  For compressed streams, note however that the BufferOffset
                            //  may not be contiguous!
                            //

                            if ((*NumberRuns != 0) &&
                                (NextLbo == (IoRuns[*NumberRuns - 1].StartingLbo +
                                                      IoRuns[*NumberRuns - 1].ByteCount)) &&
                                (BufferOffset == (IoRuns[*NumberRuns - 1].BufferOffset +
                                                      IoRuns[*NumberRuns - 1].ByteCount))) {

                                IoRuns[*NumberRuns - 1].ByteCount += NextByteCount;

                            //
                            //  Otherwise it is time to start a new run, if there is space for one.
                            //

                            } else {

                                //
                                //  If we have filled up the current I/O runs array, then we
                                //  will grow it once to a size which would allow the worst
                                //  case compression unit (all noncontiguous clusters) to
                                //  start at the penultimate index.  The following if
                                //  statement enforces this case as the worst case.  With 16
                                //  clusters per compression unit, the theoretical maximum
                                //  number of parallel I/Os would be 16 + NTFS_MAX_PARALLEL_IOS - 1,
                                //  since we stop on the first compression unit
                                //  boundary after the penultimate run.  Normally, of course we
                                //  will do much fewer.
                                //

                                if ((*NumberRuns == NTFS_MAX_PARALLEL_IOS) &&
                                    (CompressionContext->AllocatedRuns == NTFS_MAX_PARALLEL_IOS)) {

                                    PIO_RUN NewIoRuns;

                                    NewIoRuns = NtfsAllocatePool( NonPagedPool,
                                                                   (CompressionUnitInClusters + NTFS_MAX_PARALLEL_IOS - 1) * sizeof(IO_RUN) );

                                    RtlCopyMemory( NewIoRuns,
                                                   CompressionContext->IoRuns,
                                                   NTFS_MAX_PARALLEL_IOS * sizeof(IO_RUN) );

                                    IoRuns = CompressionContext->IoRuns = NewIoRuns;
                                    CompressionContext->AllocatedRuns = CompressionUnitInClusters + NTFS_MAX_PARALLEL_IOS - 1;
                                }

                                //
                                // We remember each piece of a parallel run by saving the
                                // essential information in the IoRuns array.  The transfers
                                // will be started up in parallel below.
                                //

                                IoRuns[*NumberRuns].StartingVbo = TempVbo;
                                IoRuns[*NumberRuns].StartingLbo = NextLbo;
                                IoRuns[*NumberRuns].BufferOffset = BufferOffset;
                                IoRuns[*NumberRuns].ByteCount = NextByteCount;
                                *NumberRuns += 1;
                            }

                            //
                            // Now adjust everything for the next pass through the loop.
                            //

                            BufferOffset += NextByteCount;
                            TempVbo = TempVbo + NextByteCount;
                            FinalCompressedSize -= NextByteCount;
                            ClusterOffset = 0;
                        }

                    } else if (DeleteCount != 0) {

                        //
                        //  Call our reallocate routine.
                        //

                        NtfsReallocateRange( IrpContext,
                                             Scb,
                                             DeleteVcn,
                                             DeleteCount,
                                             0,
                                             0,
                                             NULL );

                        ChangeAllocation = TRUE;
                    }

                }

                //
                //  For the compressed stream, we need to advance the buffer offset to the
                //  end of a compression unit, so that if adjacent compression units are
                //  being written, we correctly advance over the unused clusters in the
                //  compressed stream.
                //


                if (FlagOn(StreamFlags, COMPRESSED_STREAM)) {
                    BufferOffset += CompressionUnit - (ULONG)(TempVbo & (CompressionUnit - 1));
                }

                //
                //  If this is the unnamed data stream then we need to update
                //  the total allocated size.
                //

                if (ChangeAllocation &&
                    FlagOn( Scb->ScbState, SCB_STATE_UNNAMED_DATA ) &&
                    (Scb->Fcb->Info.AllocatedLength != Scb->TotalAllocated)) {

                    Scb->Fcb->Info.AllocatedLength = Scb->TotalAllocated;
                    SetFlag( Scb->Fcb->InfoFlags, FCB_INFO_CHANGED_ALLOC_SIZE );
                }

                UncompressedOffset += CompressionUnit - CompressionUnitOffset;

                //
                //  Now reduce the byte counts by the compression unit we just
                //  transferred.
                //

                if ((CompressionUnit != 0) && (ByteCount > CompressionUnit)) {
                    StartVbo += (CompressionUnit - SparseFileBias);
                    ByteCount -= CompressionUnit;
                } else {
                    StartVbo += (ByteCount - SparseFileBias);
                    ByteCount = 0;
                    leave;
                }

                CompressionUnitOffset = 0;
            }

        } finally {

            NtfsUnpinBcb( IrpContext, &Bcb );
        }

        //
        //  See if we need to advance ValidDataToDisk.
        //

        if (FlagOn( Scb->AttributeFlags, ATTRIBUTE_FLAG_COMPRESSION_MASK ) &&
            (StartVbo > Scb->ValidDataToDisk)) {

            ASSERT( (Scb->ScbSnapshot != NULL) && (Scb->ScbSnapshot->ValidDataToDisk == Scb->ValidDataToDisk) );
            Scb->ValidDataToDisk = StartVbo;
        }

        return ByteCount + ReturnByteCount;
    }
}


//
//  Internal support routine
//

NTSTATUS
NtfsFinishBuffers (
    IN PIRP_CONTEXT IrpContext,
    IN PIRP Irp,
    IN PSCB Scb,
    IN PVBO StartingVbo,
    IN ULONG ByteCount,
    IN ULONG NumberRuns,
    IN PCOMPRESSION_CONTEXT CompressionContext,
    IN ULONG StreamFlags
    )

/*++

Routine Description:

    This routine performs post processing for noncached transfers of
    compressed or encrypted data.  For reads, the decryption callback (if any)
    is run over each Io run and the decompression actually takes place here.
    For writes issued on the originating Irp, the current transaction is
    checkpointed and the resources recorded in the IrpContext are released.

Arguments:

    IrpContext->MajorFunction - Supplies either IRP_MJ_READ or IRP_MJ_WRITE.

    Irp - Supplies the requesting Irp.

    Scb - Supplies the stream file to act on.

    StartingVbo - Pointer to the starting point for the operation.

    ByteCount - The length of the operation.

    NumberRuns - The number of entries in CompressionContext->IoRuns which
                 are handed to the decryption callback on a read.

    CompressionContext - Supplies information related to the compression
                         filled in by NtfsPrepareBuffers.

    StreamFlags - Supplies either 0 or some combination of COMPRESSED_STREAM
                  and ENCRYPTED_STREAM

Return Value:

    Status from the operation.  This is STATUS_SUCCESS, the failure status
    returned by the decryption callback, or the exception code caught while
    filling the destination buffer (replaced by STATUS_INVALID_USER_BUFFER
    when the code is not an expected status).  A decompression failure is
    not reported; the affected range is filled with a pattern instead.

--*/

{
    VCN CurrentVcn, NextVcn, BeyondLastVcn;
    LCN NextLcn;

    ULONG Run;

    ULONG NextByteCount;
    LONGLONG NextClusterCount;

    LARGE_INTEGER OffsetWithinFile;

    BOOLEAN NextIsAllocated;
    BOOLEAN AlreadyFilled;

    PVOID SystemBuffer = NULL;

    ULONG CompressionUnit, CompressionUnitInClusters;
    ULONG StartingOffset, UncompressedOffset, CompressedOffset;
    ULONG CompressedSize;
    LONGLONG UncompressedSize;

    LONGLONG CurrentAllocatedClusterCount;

    NTSTATUS Status = STATUS_SUCCESS;

    PVCB Vcb = Scb->Vcb;

    PAGED_CODE();

    //
    //  If this is a normal termination of a read, then let's give him the
    //  data...
    //

    ASSERT( (Scb->CompressionUnit != 0) ||
            (Scb->EncryptionContext != NULL) ||
            FlagOn( StreamFlags, COMPRESSED_STREAM ) );

    //
    //  We never want to be here if this is the read raw encrypted data case.
    //

    ASSERT( !FlagOn( StreamFlags, ENCRYPTED_STREAM ) );

    if (IrpContext->MajorFunction == IRP_MJ_READ) {

        //
        //  If there is an encryption context then transform the data.
        //

        if ((Scb->EncryptionContext != NULL) &&
            (NtfsData.EncryptionCallBackTable.AfterReadProcess != NULL)) {

            ASSERT ( NtfsIsTypeCodeEncryptible( Scb->AttributeTypeCode ) );

            //
            //  If the compression context has a buffer then we will use that.
            //  Otherwise the data is decrypted in place in the mapped user buffer.
            //

            if (CompressionContext->CompressionBuffer != NULL) {

                SystemBuffer = CompressionContext->CompressionBuffer;

            } else {

                SystemBuffer = NtfsMapUserBuffer( Irp, NormalPagePriority );
            }

            //
            //  Now look at each run of real data heading coming from the disk and
            //  let the encryption driver decrypt it.  The first failing run ends
            //  the routine with that status.
            //

            for ( Run = 0; Run < NumberRuns; Run++ ) {

                OffsetWithinFile.QuadPart = CompressionContext->IoRuns[Run].StartingVbo;

                Status = NtfsData.EncryptionCallBackTable.AfterReadProcess(
                                                Add2Ptr(SystemBuffer, CompressionContext->IoRuns[Run].BufferOffset),
                                                &OffsetWithinFile,
                                                CompressionContext->IoRuns[Run].ByteCount,
                                                Scb->EncryptionContext);

                if (!NT_SUCCESS( Status )) {

                    return Status;
                }
            }

            //
            //  NOTE(review): the loop above already returns on any failure and
            //  Status starts out as STATUS_SUCCESS, so this check looks redundant.
            //

            if (!NT_SUCCESS( Status )) {

                return Status;
            }
        }

        //
        //  There may be a compression unit but there is no completion to do
        //  i.e this is an uncompressed sparse file.
        //  We might be operating on an encrypted file as well.
        //  In either case just exit if the file is not compressed.
        //  SystemBuffer is only non-NULL here if the decryption pass above ran.
        //

        if (!FlagOn( Scb->AttributeFlags, ATTRIBUTE_FLAG_COMPRESSION_MASK )) {

            if (SystemBuffer != NULL) {

                KeFlushIoBuffers( Irp->MdlAddress, TRUE, FALSE );
            }

            return STATUS_SUCCESS;
        }

        //
        //  Decompression is only done for reads which are not through the
        //  compressed stream.  A COMPRESSED_STREAM read skips this block and
        //  returns Status at the bottom of the routine.
        //

        ASSERT( Scb->CompressionUnit != 0 );
        if (!FlagOn( StreamFlags, COMPRESSED_STREAM )) {

            //
            //  Initialize remaining context for the loop.
            //

            CompressionUnit = Scb->CompressionUnit;
            CompressionUnitInClusters = ClustersFromBytes(Vcb, CompressionUnit);
            CompressedOffset = 0;
            UncompressedOffset = 0;
            Status = STATUS_SUCCESS;

            //
            //  Map the user buffer.  The destination is the system buffer and
            //  offset recorded in the compression context.
            //

            SystemBuffer = (PVOID)((PCHAR)CompressionContext->SystemBuffer +
                                          CompressionContext->SystemBufferOffset);


            //
            //  Calculate the first Vcn and offset within the compression
            //  unit of the start of the transfer, and lookup the first
            //  run.
            //

            StartingOffset = *((PULONG)StartingVbo) & (CompressionUnit - 1);
            CurrentVcn = LlClustersFromBytes(Vcb, *StartingVbo - StartingOffset);

            NextIsAllocated =
            NtfsLookupAllocation( IrpContext,
                                  Scb,
                                  CurrentVcn,
                                  &NextLcn,
                                  &CurrentAllocatedClusterCount,
                                  NULL,
                                  NULL );

            //
            //  Set NextIsAllocated and NextLcn as the Mcb package would, to show if
            //  we are off the end.
            //

            if (!NextIsAllocated) {
                NextLcn = UNUSED_LCN;
            }

            NextIsAllocated = (BOOLEAN)(CurrentAllocatedClusterCount < (MAXLONGLONG - CurrentVcn));

            //
            //  If this is actually a hole or there was no entry in the Mcb, then
            //  set CurrentAllocatedClusterCount to zero so we will always make the first
            //  pass in the embedded while loop below.
            //

            if (!NextIsAllocated || (NextLcn == UNUSED_LCN)) {
                CurrentAllocatedClusterCount = 0;
            }

            //
            //  Prepare for the initial Mcb scan below by pretending that the
            //  next run has been looked up, and is a contiguous run of 0 clusters!
            //

            NextVcn = CurrentVcn + CurrentAllocatedClusterCount;
            NextClusterCount = 0;

            //
            //  Remember the last Vcn we should look up.  This is the end of the
            //  transfer rounded up to a compression unit boundary.
            //

            BeyondLastVcn = BlockAlign( *StartingVbo + ByteCount, (LONG)CompressionUnit );
            BeyondLastVcn = LlClustersFromBytesTruncate( Vcb, BeyondLastVcn );

            //
            //  Loop to return the data.  Each pass handles one compression unit
            //  (or the requested part of it).
            //

            while (ByteCount != 0) {

                //
                //  Loop to determine the compressed size of the next compression
                //  unit.  I.e., loop until we either find the end of the current
                //  range of contiguous Vcns, or until we find that the current
                //  compression unit is fully allocated.
                //

                while (NextIsAllocated &&
                       (CurrentAllocatedClusterCount < CompressionUnitInClusters) &&
                       ((CurrentVcn + CurrentAllocatedClusterCount) == NextVcn)) {

                    //
                    //  NOTE(review): the loop condition has just established
                    //  equality with NextVcn, so this test cannot be true here
                    //  and the raise appears to be unreachable.
                    //

                    if ((CurrentVcn + CurrentAllocatedClusterCount) > NextVcn) {

                        NtfsRaiseStatus( IrpContext, STATUS_FILE_CORRUPT_ERROR, NULL, Scb->Fcb );
                    }

                    CurrentAllocatedClusterCount = CurrentAllocatedClusterCount + NextClusterCount;

                    //
                    //  Loop to find the next allocated Vcn, or the end of the Mcb.
                    //  None of the interfaces using RangePtr and Index as inputs
                    //  can be used here, such as NtfsGetSequentialMcbEntry, because
                    //  we do not have the Scb main resource acquired, and writers can
                    //  be moving stuff around in parallel.
                    //

                    while (TRUE) {

                        //
                        //  Set up NextVcn for next call
                        //

                        NextVcn += NextClusterCount;

                        //
                        //  Exit if we are past the end of the range being decompressed.
                        //

                        if (NextVcn >= BeyondLastVcn) {

                            NextIsAllocated = TRUE;
                            break;
                        }

                        NextIsAllocated = NtfsLookupAllocation( IrpContext,
                                                                Scb,
                                                                NextVcn,
                                                                &NextLcn,
                                                                &NextClusterCount,
                                                                NULL,
                                                                NULL );

                        //
                        //  Set NextIsAllocated and NextLcn as the Mcb package would, to show if
                        //  we are off the end.
                        //

                        if (!NextIsAllocated) {
                            NextLcn = UNUSED_LCN;
                        }

                        NextIsAllocated = (BOOLEAN)(NextClusterCount < (MAXLONGLONG - NextVcn));

                        //
                        //  Get out if we hit the end or see something allocated.
                        //

                        if (!NextIsAllocated || (NextLcn != UNUSED_LCN)) {
                            break;
                        }
                    }
                }

                //
                //  The compression unit is fully allocated.  Any clusters beyond
                //  this unit are carried forward to the next pass.
                //

                if (CurrentAllocatedClusterCount >= CompressionUnitInClusters) {

                    CompressedSize = CompressionUnit;
                    CurrentAllocatedClusterCount = CurrentAllocatedClusterCount - CompressionUnitInClusters;

                //
                //  Otherwise calculate how much is allocated at the current Vcn
                //  (if any).
                //

                } else {

                    CompressedSize = BytesFromClusters(Vcb, (ULONG)CurrentAllocatedClusterCount);
                    CurrentAllocatedClusterCount = 0;
                }

                //
                //  The next time through this loop, we will be working on the next
                //  compression unit.
                //

                CurrentVcn = CurrentVcn + CompressionUnitInClusters;

                //
                //  Calculate uncompressed size of the desired fragment, or
                //  entire compression unit.  FileSize is sampled under the
                //  Fsrtl header lock.
                //

                NtfsAcquireFsrtlHeader( Scb );
                UncompressedSize = Scb->Header.FileSize.QuadPart -
                                   (*StartingVbo + UncompressedOffset);
                NtfsReleaseFsrtlHeader( Scb );

                //
                //  NOTE(review): the cast-as-lvalue below is a compiler extension;
                //  presumably it stores into the low 32 bits only, which is all the
                //  later (ULONG) and (PULONG) uses read - confirm.
                //

                if (UncompressedSize > CompressionUnit) {
                    (ULONG)UncompressedSize = CompressionUnit;
                }

                //
                //  Calculate how much we want now, based on StartingOffset and
                //  ByteCount.
                //

                NextByteCount = CompressionUnit - StartingOffset;
                if (NextByteCount > ByteCount) {
                    NextByteCount = ByteCount;
                }

                //
                //  Practice safe access
                //

                try {

                    //
                    //  There were no clusters allocated, return 0's.
                    //

                    AlreadyFilled = FALSE;
                    if (CompressedSize == 0) {

                        RtlZeroMemory( (PUCHAR)SystemBuffer + UncompressedOffset,
                                       NextByteCount );

                    //
                    //  The compression unit was fully allocated, just copy.
                    //

                    } else if (CompressedSize == CompressionUnit) {

                        RtlCopyMemory( (PUCHAR)SystemBuffer + UncompressedOffset,
                                       CompressionContext->CompressionBuffer +
                                         CompressedOffset + StartingOffset,
                                       NextByteCount );

                    //
                    //  Caller does not want the entire compression unit, decompress
                    //  a fragment.
                    //

                    } else if (NextByteCount < CompressionUnit) {

                        //
                        //  If we have not already allocated the workspace, then do it.
                        //  The workspace is kept in the compression context so later
                        //  passes reuse it.
                        //

                        if (CompressionContext->WorkSpace == NULL) {
                            ULONG CompressWorkSpaceSize;
                            ULONG FragmentWorkSpaceSize;

                            ASSERT((Scb->AttributeFlags & ATTRIBUTE_FLAG_COMPRESSION_MASK) != 0);

                            (VOID) RtlGetCompressionWorkSpaceSize( (USHORT)((Scb->AttributeFlags & ATTRIBUTE_FLAG_COMPRESSION_MASK) + 1),
                                                                   &CompressWorkSpaceSize,
                                                                   &FragmentWorkSpaceSize );

                            //
                            //  Allocate first from non-paged, then paged.  The typical
                            //  size of this workspace is just over a single page so
                            //  if both allocations fail then the system is running
                            //  a reduced capacity.  Return an error to the user
                            //  and let him retry.
                            //

                            CompressionContext->WorkSpace = NtfsAllocatePoolWithTagNoRaise( NonPagedPool, FragmentWorkSpaceSize, 'wftN' );

                            if (CompressionContext->WorkSpace == NULL) {

                                CompressionContext->WorkSpace =
                                    NtfsAllocatePool( PagedPool, FragmentWorkSpaceSize );
                            }
                        }

                        while (TRUE) {

                            Status =
                            RtlDecompressFragment( (USHORT)((Scb->AttributeFlags & ATTRIBUTE_FLAG_COMPRESSION_MASK) + 1),
                                                   (PUCHAR)SystemBuffer + UncompressedOffset,
                                                   NextByteCount,
                                                   CompressionContext->CompressionBuffer + CompressedOffset,
                                                   CompressedSize,
                                                   StartingOffset,
                                                   (PULONG)&UncompressedSize,
                                                   CompressionContext->WorkSpace );

                            ASSERT(NT_SUCCESS( Status ) || !NtfsStopOnDecompressError);

                            //
                            //  On success zero whatever part of the request the
                            //  decompressor did not produce.
                            //

                            if (NT_SUCCESS(Status)) {

                                RtlZeroMemory( (PUCHAR)SystemBuffer + UncompressedOffset + (ULONG)UncompressedSize,
                                               NextByteCount - (ULONG)UncompressedSize );
                                break;

                            } else {

                                //
                                //  The compressed buffer could have been bad.  We need to fill
                                //  it with a pattern and get on with life.  Someone could be
                                //  faulting it in just to overwrite it, or it could be a rare
                                //  case of corruption.  We fill the data with a pattern, but
                                //  we must return success so a pagefault will succeed.  We
                                //  do this once, then loop back to decompress what we can.
                                //

                                Status = STATUS_SUCCESS;

                                if (!AlreadyFilled) {

                                    RtlFillMemory( (PUCHAR)SystemBuffer + UncompressedOffset,
                                                   NextByteCount,
                                                   0xDF );
                                    AlreadyFilled = TRUE;

                                } else {
                                    break;
                                }
                            }
                        }

                    //
                    //  Decompress the entire compression unit.
                    //

                    } else {

                        ASSERT( StartingOffset == 0 );

                        while (TRUE) {

                            Status =
                            RtlDecompressBuffer( (USHORT)((Scb->AttributeFlags & ATTRIBUTE_FLAG_COMPRESSION_MASK) + 1),
                                                 (PUCHAR)SystemBuffer + UncompressedOffset,
                                                 NextByteCount,
                                                 CompressionContext->CompressionBuffer + CompressedOffset,
                                                 CompressedSize,
                                                 (PULONG)&UncompressedSize );

                            ASSERT(NT_SUCCESS( Status ) || !NtfsStopOnDecompressError);

                            //
                            //  On success zero whatever part of the request the
                            //  decompressor did not produce.
                            //

                            if (NT_SUCCESS(Status)) {

                                RtlZeroMemory( (PUCHAR)SystemBuffer + UncompressedOffset + (ULONG)UncompressedSize,
                                               NextByteCount - (ULONG)UncompressedSize );
                                break;

                            } else {

                                //
                                //  The compressed buffer could have been bad.  We need to fill
                                //  it with a pattern and get on with life.  Someone could be
                                //  faulting it in just to overwrite it, or it could be a rare
                                //  case of corruption.  We fill the data with a pattern, but
                                //  we must return success so a pagefault will succeed.  We
                                //  do this once, then loop back to decompress what we can.
                                //

                                Status = STATUS_SUCCESS;

                                if (!AlreadyFilled) {

                                    RtlFillMemory( (PUCHAR)SystemBuffer + UncompressedOffset,
                                                   NextByteCount,
                                                   0xDB );
                                    AlreadyFilled = TRUE;

                                } else {
                                    break;
                                }
                            }
                        }
                    }

                //
                //  If its an unexpected error then
                //  Probably Gary's decompression routine faulted, but blame it on
                //  the user buffer!
                //

                } except(NtfsCompressionFilter(IrpContext, GetExceptionInformation())) {

                      Status = GetExceptionCode();
                      if (!FsRtlIsNtstatusExpected( Status )) {
                          Status = STATUS_INVALID_USER_BUFFER;
                      }
                }

                //
                //  Stop at the first failure.  The buffer flush below still runs.
                //

                if (!NT_SUCCESS(Status)) {
                    break;
                }

                //
                //  Advance these fields for the next pass through.  Only the first
                //  compression unit can start at a non-zero offset.
                //

                StartingOffset = 0;
                UncompressedOffset += NextByteCount;
                CompressedOffset += CompressedSize;
                ByteCount -= NextByteCount;
            }

            //
            //  We now flush the user's buffer to memory.
            //

            KeFlushIoBuffers( CompressionContext->SavedMdl, TRUE, FALSE );
        }

    //
    //  For compressed writes we just checkpoint the transaction and
    //  free all snapshots and resources, then get the Scb back.  Only do this if the
    //  request is for the same Irp as the original Irp.  We don't want to checkpoint
    //  if called from NtfsWriteClusters.
    //

    } else if (Irp == IrpContext->OriginatingIrp) {

        if (CompressionContext->ScbAcquired) {

            BOOLEAN Reinsert = FALSE;

            NtfsCheckpointCurrentTransaction( IrpContext );

            //
            //  We want to empty the exclusive Fcb list but still hold
            //  the current file.  Go ahead and remove it from the exclusive
            //  list and reinsert it after freeing the other entries.
            //

            while (!IsListEmpty(&IrpContext->ExclusiveFcbList)) {

                //
                //  If this is the Scb for this Fcb then remove it from the list.
                //  We have to preserve the number of times this Fcb may have been
                //  acquired outside of PrepareBuffers.
                //

                if ((PFCB)CONTAINING_RECORD( IrpContext->ExclusiveFcbList.Flink,
                                             FCB,
                                             ExclusiveFcbLinks ) == Scb->Fcb) {

                    RemoveEntryList( &Scb->Fcb->ExclusiveFcbLinks );
                    Reinsert = TRUE;

                } else {

                    NtfsReleaseFcb( IrpContext,
                                    (PFCB)CONTAINING_RECORD(IrpContext->ExclusiveFcbList.Flink,
                                                            FCB,
                                                            ExclusiveFcbLinks ));
                }
            }

            ClearFlag( IrpContext->Flags, IRP_CONTEXT_FLAG_RELEASE_USN_JRNL |
                                          IRP_CONTEXT_FLAG_RELEASE_MFT );

            if (Reinsert) {

                InsertHeadList( &IrpContext->ExclusiveFcbList,
                                &Scb->Fcb->ExclusiveFcbLinks );

                //
                //  Release the Scb if we acquired it in PrepareBuffers.  It is
                //  important that we have released the Scb before going back
                //  and faulting into the data section.  Otherwise we could
                //  hit a collided page fault deadlock.
                //

                NtfsReleaseScb( IrpContext, Scb );
                CompressionContext->ScbAcquired = FALSE;
            }
        }
    }

    return Status;
}


PMDL
NtfsLockFileRange (
    IN PIRP_CONTEXT IrpContext,
    IN PSCB Scb,
    IN LONGLONG Offset,
    IN ULONG Length
    )

/*++

Routine Description:

    Maps a byte range of a cached stream through the cache manager, builds
    an Mdl over the mapped addresses and probes and locks the pages behind
    it.  The cache mapping itself is dropped again before returning, on
    both the success and the failure path.

Arguments:

    IrpContext - Context of the call, used when raising.

    Scb - Supplies the stream file to act on.  It must have a file object.

    Offset - The starting point to be probed and locked

    Length - The length of the operation.

Return Value:

    PMDL - a mdl representing the locked area - this mdl must be unlocked and freed by the caller

    Any failure is raised rather than returned.

--*/

{
    PMDL LockedMdl = NULL;
    PBCB MapBcb;
    PVOID MappedVa;
    NTSTATUS RaiseStatus;

    //
    //  Only streams which have a file object can be mapped.
    //

    ASSERT( Scb->FileObject != NULL);

    //
    //  Have the cache manager map the range, waiting if needed.
    //

    CcMapData( Scb->FileObject, (PLARGE_INTEGER)&Offset, Length, TRUE, &MapBcb, &MappedVa );

#ifdef MAPCOUNT_DBG
    IrpContext->MapCount++;
#endif

    //
    //  Describe and lock the mapped pages.  Read access is requested on
    //  purpose: asking Mm for write access here would make him set the
    //  pages dirty now and again at unlock time.
    //

    try {

        LockedMdl = IoAllocateMdl( MappedVa, Length, FALSE, FALSE, NULL );

        if (NULL == LockedMdl) {
            NtfsRaiseStatus( IrpContext, STATUS_INSUFFICIENT_RESOURCES, NULL, NULL );
        }

        MmProbeAndLockPages( LockedMdl, KernelMode, IoReadAccess );

    //
    //  Anything raised above lands here.  Undo the mapping and the Mdl,
    //  then raise again with a status the caller can expect.
    //

    } except(EXCEPTION_EXECUTE_HANDLER) {

        RaiseStatus = GetExceptionCode();

        CcUnpinData( MapBcb );

#ifdef MAPCOUNT_DBG
        IrpContext->MapCount--;
#endif

        if (LockedMdl != NULL) {

            IoFreeMdl( LockedMdl );
            LockedMdl = NULL;
        }

        if (!FsRtlIsNtstatusExpected(RaiseStatus)) {
            RaiseStatus = STATUS_UNEXPECTED_IO_ERROR;
        }

        NtfsRaiseStatus( IrpContext, RaiseStatus, NULL, NULL );
    }

    //
    //  The pages are locked through the Mdl now, so the cache mapping can go.
    //

    CcUnpinData( MapBcb );

#ifdef MAPCOUNT_DBG
    IrpContext->MapCount--;
#endif

    return LockedMdl;
}


VOID
NtfsZeroEndOfSector (
    IN PIRP_CONTEXT IrpContext,
    IN PIRP Irp,
    IN PSCB Scb,
    IN LONGLONG Offset,
    IN BOOLEAN Cached
    )
/*++

Routine Description:

    This function zeroes from the given offset to the next sector boundary directly
    onto disk.  Particularly if the file is cached the caller must synchronize in some fashion
    to prevent the sector from being written at the same time through other paths.  I.e
    own ioateof or paging exclusive.  Also this must only be called with non sparse / non compressed files

    In the cached case the zeroes are written with CcCopyWrite.  In the noncached
    case the containing sector is read from the target device, decrypted if needed,
    zeroed from Offset to the end of the sector, re-encrypted if needed and written
    back.  If the cluster containing Offset has no allocation, nothing is done.

Arguments:

    IrpContext - Context of the call.  In the noncached case its wait state is
                 forced on for the duration of the call and then restored.

    Irp - Irp used for the transfers.  Its file object is the target of the cached
          write.  In the noncached case its MdlAddress is temporarily replaced and
          then restored, and IoStatus.Information is cleared after the read.

    Scb - Supplies the stream file to act on.

    Offset - The starting offset to zero to its sector boundary

    Cached - TRUE to zero through the cache, FALSE to read, modify and write the
             sector on the target device.

Return Value:

    None - raises on error

--*/
{
    PVCB Vcb = Scb->Fcb->Vcb;
    ULONG BufferLength = Vcb->BytesPerSector;
    PMDL Mdl = NULL;
    PMDL OriginalMdl = Irp->MdlAddress;
    PVOID Buffer = NULL;
    LCN Lcn;
    LONGLONG ClusterCount;
    LONGLONG LogicalOffset;
    LONGLONG VirtualOffset;
    PIO_STACK_LOCATION IrpSp = IoGetCurrentIrpStackLocation( Irp );
    NTSTATUS Status;
    LOGICAL Wait = FlagOn( IrpContext->State, IRP_CONTEXT_STATE_WAIT );

    ASSERT( !FlagOn( Scb->AttributeFlags, ATTRIBUTE_FLAG_COMPRESSION_MASK ) );

    //
    //  Decide whether to use cached or noncached path
    //

    if (Cached) {

        NtfsCreateMdlAndBuffer( IrpContext,
                                Scb,
                                RESERVED_BUFFER_ONE_NEEDED,
                                &BufferLength,
                                &Mdl,
                                &Buffer );

        try {

            //
            //  Write zeroes from Offset up to the end of its sector through the cache.
            //

            RtlZeroMemory( Buffer, Vcb->BytesPerSector - (LONG)(Offset % Vcb->BytesPerSector) );
            CcCopyWrite( IrpSp->FileObject, (PLARGE_INTEGER)&Offset, Vcb->BytesPerSector - (LONG)(Offset % Vcb->BytesPerSector), TRUE, Buffer );

        } finally {
            NtfsDeleteMdlAndBuffer( Mdl, Buffer );
        }

    } else {

        //
        //  Find the lcn that contains the cluster in question
        //

        if (NtfsLookupAllocation( IrpContext, Scb, LlClustersFromBytesTruncate( Vcb, Offset ), &Lcn, &ClusterCount, NULL, NULL )) {

            try {

                //
                //  Set calls to be temp. synchronous
                //

                SetFlag( IrpContext->State, IRP_CONTEXT_STATE_WAIT );

                NtfsCreateMdlAndBuffer( IrpContext,
                                        Scb,
                                        RESERVED_BUFFER_ONE_NEEDED,
                                        &BufferLength,
                                        &Mdl,
                                        &Buffer );
                Irp->MdlAddress = Mdl;

                //
                //  The logical offset on disk is at the lcn we found + the offset within that cluster of the
                //  offset rounded down to the nearest sector
                //

                LogicalOffset = LlBytesFromClusters( Vcb, Lcn ) + Offset - BlockAlignTruncate( Offset, (LONG)Vcb->BytesPerCluster );
                LogicalOffset = BlockAlignTruncate( LogicalOffset, (LONG)Vcb->BytesPerSector );

                //
                //  The file offset of the start of the sector.  Both encryption
                //  callbacks below are tested for independently and both need this
                //  value, so compute it up front rather than only when the
                //  after-read callback is present.
                //

                VirtualOffset = BlockAlignTruncate( Offset, (LONG)Vcb->BytesPerSector );

                //
                //  First read the sector
                //

                NtfsSingleAsync( IrpContext,
                                 Vcb->TargetDeviceObject,
                                 LogicalOffset,
                                 Vcb->BytesPerSector,
                                 Irp,
                                 IRP_MJ_READ,
                                 0 );

                NtfsWaitSync( IrpContext );

                NtfsNormalizeAndCleanupTransaction( IrpContext,
                                                    &Irp->IoStatus.Status,
                                                    TRUE,
                                                    STATUS_UNEXPECTED_IO_ERROR );

                //
                //  Decrypt the buffer if its encrypted
                //

                if ((Scb->EncryptionContext != NULL) &&
                    (NtfsData.EncryptionCallBackTable.AfterReadProcess != NULL)) {

                    Status = NtfsData.EncryptionCallBackTable.AfterReadProcess( Buffer,
                                                                                (PLARGE_INTEGER)&VirtualOffset,
                                                                                Vcb->BytesPerSector,
                                                                                Scb->EncryptionContext );

                    if (!NT_SUCCESS( Status )) {

                        NtfsRaiseStatus( IrpContext, Status, &Scb->Fcb->FileReference, Scb->Fcb );
                    }
                }

                //
                //  Clear return info field
                //

                Irp->IoStatus.Information = 0;

                //
                //  Zero out the remainder of the sector
                //

                RtlZeroMemory( Add2Ptr( Buffer, (LONG)(Offset % Vcb->BytesPerSector )),  Vcb->BytesPerSector - (LONG)(Offset % Vcb->BytesPerSector) );


                //
                //  Re-encrypt the buffer if its encrypted
                //

                if ((Scb->EncryptionContext != NULL) &&
                    (NtfsData.EncryptionCallBackTable.BeforeWriteProcess != NULL)) {

                    Status = NtfsData.EncryptionCallBackTable.BeforeWriteProcess( Buffer,
                                                                                  Buffer,
                                                                                  (PLARGE_INTEGER)&VirtualOffset,
                                                                                  Vcb->BytesPerSector,
                                                                                  Scb->EncryptionContext );
                    if (!NT_SUCCESS( Status )) {

                        NtfsRaiseStatus( IrpContext, Status, &Scb->Fcb->FileReference, Scb->Fcb );
                    }
                }

                //
                //  Rewrite the sector back down
                //
                //  NOTE(review): unlike the read above, the completion status of
                //  this write is not examined here - confirm that is intended.
                //

                NtfsSingleAsync( IrpContext,
                                 Vcb->TargetDeviceObject,
                                 LogicalOffset,
                                 Vcb->BytesPerSector,
                                 Irp,
                                 IRP_MJ_WRITE,
                                 0 );

                NtfsWaitSync( IrpContext );

            } finally {

                //
                //  Reset to original wait state
                //

                if (!Wait) {
                    ClearFlag( IrpContext->State, IRP_CONTEXT_STATE_WAIT );
                }

                //
                //  Free the temporary buffer and give the Irp its own Mdl back.
                //

                NtfsDeleteMdlAndBuffer( Mdl, Buffer );
                Irp->MdlAddress = OriginalMdl;
            }
        }

    }

    return;
}


NTSTATUS
NtfsNonCachedIo (
    IN PIRP_CONTEXT IrpContext,
    IN PIRP Irp,
    IN PSCB Scb,
    IN VBO StartingVbo,
    IN ULONG ByteCount,
    IN ULONG StreamFlags
    )

/*++

Routine Description:

    This routine performs the non-cached disk io described in its parameters.
    The choice of a single run is made if possible, otherwise multiple runs
    are executed.

    Sparse files are supported.  If "holes" are encountered, then the user
    buffer is zeroed over the specified range.  This should only happen on
    reads during normal operation, but it can also happen on writes during
    restart, in which case it is also appropriate to zero the buffer.

Arguments:

    IrpContext->MajorFunction - Supplies either IRP_MJ_READ or IRP_MJ_WRITE.

    Irp - Supplies the requesting Irp.

    Scb - Supplies the stream file to act on.

    StartingVbo - The starting point for the operation.

    ByteCount - The length of the operation.

    StreamFlags - Supplies either 0 or some combination of COMPRESSED_STREAM
                  and ENCRYPTED_STREAM

Return Value:

    None.

--*/

{
    ULONG OriginalByteCount, RemainingByteCount;
    ULONG NumberRuns;
    IO_RUN IoRuns[NTFS_MAX_PARALLEL_IOS];
    COMPRESSION_CONTEXT CompressionContext;
    NTSTATUS Status = STATUS_SUCCESS;
    PMDL Mdl = NULL;
    LONGLONG LfsStartingVbo;

    PVCB Vcb = Scb->Fcb->Vcb;

    BOOLEAN Wait;
    UCHAR IrpSpFlags = 0;

#ifdef PERF_STATS
    BOOLEAN CreateNewFile = FALSE;
    BOOLEAN TrackIos = FALSE;
    LARGE_INTEGER StartIo;
    LARGE_INTEGER Now;
    PTOP_LEVEL_CONTEXT TopLevelContext = NtfsGetTopLevelContext();
#endif

    PAGED_CODE();

    DebugTrace( +1, Dbg, ("NtfsNonCachedIo\n") );
    DebugTrace( 0, Dbg, ("Irp           = %08lx\n", Irp) );
    DebugTrace( 0, Dbg, ("MajorFunction = %08lx\n", IrpContext->MajorFunction) );
    DebugTrace( 0, Dbg, ("Scb           = %08lx\n", Scb) );
    DebugTrace( 0, Dbg, ("StartingVbo   = %016I64x\n", StartingVbo) );
    DebugTrace( 0, Dbg, ("ByteCount     = %08lx\n", ByteCount) );

    //
    //  Initialize some locals.
    //

    OriginalByteCount = ByteCount;

    Wait = (BOOLEAN) FlagOn( IrpContext->State, IRP_CONTEXT_STATE_WAIT );

    //
    //  Check if we need to do sequential writes.
    //

    if ((IrpContext->MajorFunction == IRP_MJ_WRITE) &&
        FlagOn( Scb->ScbState, SCB_STATE_MODIFIED_NO_WRITE )) {

        IrpSpFlags = SL_FT_SEQUENTIAL_WRITE | SL_WRITE_THROUGH;
    }

#ifdef PERF_STATS
    {
        if ((ULONG_PTR)TopLevelContext >= FSRTL_MAX_TOP_LEVEL_IRP_FLAG &&
            (ULONG_PTR)TopLevelContext != (FSRTL_CACHE_TOP_LEVEL_IRP | 0x80000000)) {

            if (TopLevelContext->SavedTopLevelIrp &&
                (ULONG_PTR)TopLevelContext->SavedTopLevelIrp >= FSRTL_MAX_TOP_LEVEL_IRP_FLAG &&
                (ULONG_PTR)TopLevelContext->SavedTopLevelIrp != (FSRTL_CACHE_TOP_LEVEL_IRP | 0x80000000) &&
                (((PTOP_LEVEL_CONTEXT)TopLevelContext->SavedTopLevelIrp)->Ntfs == 0x5346544e)) {

                TopLevelContext = (PTOP_LEVEL_CONTEXT) TopLevelContext->SavedTopLevelIrp;
            }

            if ((TopLevelContext->ThreadIrpContext->MajorFunction == IRP_MJ_CREATE) &&
                (TopLevelContext->ThreadIrpContext->MinorFunction == IRP_MN_CREATE_NEW)) {

                CreateNewFile = TRUE;
            }

            if (FlagOn( TopLevelContext->ThreadIrpContext->State, IRP_CONTEXT_STATE_TRACK_IOS )) {
                TrackIos = TRUE;
            }
        }
    }

#endif

    //
    //  Prepare the (first set) of buffers for I/O.
    //

    RtlZeroMemory( &CompressionContext, sizeof(COMPRESSION_CONTEXT) );
    CompressionContext.IoRuns = IoRuns;
    CompressionContext.AllocatedRuns = NTFS_MAX_PARALLEL_IOS;
    CompressionContext.FinishBuffersNeeded =
        ((Scb->CompressionUnit != 0) || (Scb->EncryptionContext != NULL)) &&
        !FlagOn( StreamFlags, ENCRYPTED_STREAM );

    try {

        //
        //  If this is a write to a compressed file, we want to make sure here
        //  that any fragments of compression units get locked in memory, so
        //  no one will be reading them into the cache while we are mucking with
        //  the Mcb, etc.  We do this right here at the top so that we have
        //  more stack(!), and we get this over with before we have to acquire
        //  the Scb exclusive.
        //

        if ((IrpContext->MajorFunction == IRP_MJ_WRITE) &&
            (Scb->CompressionUnit != 0) &&
            FlagOn( Scb->AttributeFlags, ATTRIBUTE_FLAG_COMPRESSION_MASK )) {

            LONGLONG TempOffset;
            LONGLONG TempRange;
            ULONG CompressionUnit = Scb->CompressionUnit;

#ifdef  COMPRESS_ON_WIRE

            //
            //  For a compressed stream, just make sure the stream exists.
            //

            if (FlagOn( StreamFlags, COMPRESSED_STREAM )) {

                if (Scb->Header.FileObjectC == NULL) {
                    NtfsCreateInternalCompressedStream( IrpContext, Scb, FALSE, NULL );

                    //
                    //  If there is no one who will cause this stream to
                    //  be dereferenced then add an entry on the delayed
                    //  close queue for this.  We can do this test without
                    //  worrying about synchronization since it is OK to have
                    //  an extra entry in the delayed queue.
                    //

                    if ((Scb->CleanupCount == 0) &&
                        (Scb->Fcb->DelayedCloseCount == 0)) {

                        NtfsAddScbToFspClose( IrpContext, Scb, TRUE );
                    }
                }
            //
            //  This better be paging I/O, because we ignore the caller's buffer
            //  and write the entire compression unit out of the section.
            //
            //  We don't want to map in the data in the case where we are called
            //  from write clusters because MM is creating the section for the
            //  file.  Otherwise we will deadlock when Cc tries to create the
            //  section.
            //

            }

#endif

            if (

#ifdef  COMPRESS_ON_WIRE
                !FlagOn( StreamFlags, COMPRESSED_STREAM ) &&
#endif
                ((Irp == IrpContext->OriginatingIrp) ||
                 (Scb->NonpagedScb->SegmentObject.SharedCacheMap != NULL))) {

                PMDL *TempMdl;

                if (Scb->FileObject == NULL) {
                    NtfsCreateInternalAttributeStream( IrpContext,
                                                       Scb,
                                                       FALSE,
                                                       &NtfsInternalUseFile[NONCACHEDIO_FILE_NUMBER] );

                    //
                    //  If there is no one who will cause this stream to
                    //  be dereferenced then add an entry on the delayed
                    //  close queue for this.  We can do this test without
                    //  worrying about synchronization since it is OK to have
                    //  an extra entry in the delayed queue.
                    //

                    if ((Scb->CleanupCount == 0) &&
                        (Scb->Fcb->DelayedCloseCount == 0)) {

                        NtfsAddScbToFspClose( IrpContext, Scb, TRUE );
                    }
                }

                //
                //  Lock the entire range rounded to its compression unit boundaries
                //  First round the start of the range down to a compression unit and then
                //  round the top of the range up to one
                //

                TempOffset = BlockAlignTruncate( StartingVbo, (LONG)CompressionUnit );
                TempRange = BlockAlign( StartingVbo + ByteCount, (LONG)CompressionUnit );

                TempMdl = &Mdl;

                do {

                    LONGLONG MapBoundary;
                    ULONG Range;

                    MapBoundary = BlockAlign( TempOffset + 1, VACB_MAPPING_GRANULARITY );
                    Range = (ULONG) min( TempRange - TempOffset, MapBoundary - TempOffset );

                    *TempMdl = NtfsLockFileRange( IrpContext,
                                                  Scb,
                                                  TempOffset,
                                                  Range );

                    TempOffset += Range;
                    TempMdl = &((*TempMdl)->Next );

                } while ( TempOffset != TempRange );

            } else {

                //
                //  This had better be a convert to non-resident.
                //

                ASSERT( StartingVbo == 0 );
                ASSERT( ByteCount <= Scb->CompressionUnit );
            }
        }

        //
        //  Check if need to trim the write for the log file.
        //

        if ((PAGE_SIZE != LFS_DEFAULT_LOG_PAGE_SIZE) &&
            (Scb == Vcb->LogFileScb) &&
            (IrpContext->MajorFunction == IRP_MJ_WRITE)) {

            LfsStartingVbo = StartingVbo;
            LfsCheckWriteRange( &Vcb->LfsWriteData, &LfsStartingVbo, &ByteCount );

            //
            //  If the byte count is now zero then exit this routine.
            //

            if (ByteCount == 0) {

                Irp->IoStatus.Status = STATUS_SUCCESS;
                Irp->IoStatus.Information = ByteCount;
                DebugTrace( -1, Dbg, ("NtfsNonCachedIo -> %08lx\n", Irp->IoStatus.Status) );
                try_return( Status = Irp->IoStatus.Status );
            }

            //
            //  Adjust the buffer offset in the compression context if necessary.
            //

            CompressionContext.SystemBufferOffset = (ULONG) (LfsStartingVbo - StartingVbo);
            StartingVbo = LfsStartingVbo;
        }

        RemainingByteCount = NtfsPrepareBuffers( IrpContext,
                                                 Irp,
                                                 Scb,
                                                 &StartingVbo,
                                                 ByteCount,
                                                 StreamFlags,
                                                 &Wait,
                                                 &NumberRuns,
                                                 &CompressionContext );

        //
        //  If we are writing to an encrypted stream, now is the
        //  time to do the encryption, before we pass the buffer
        //  down to the disk driver below us.
        //

        if ((Scb->EncryptionContext != NULL) &&
            (IrpContext->MajorFunction == IRP_MJ_WRITE) &&
            (NtfsData.EncryptionCallBackTable.BeforeWriteProcess != NULL) &&
            (!FlagOn( StreamFlags, ENCRYPTED_STREAM ))) {

            ASSERT ( NtfsIsTypeCodeEncryptible( Scb->AttributeTypeCode ) );
            ASSERT( NumberRuns > 0 );

            NtfsEncryptBuffers( IrpContext,
                                Irp,
                                Scb,
                                StartingVbo,
                                NumberRuns,
                                &CompressionContext );
        }

        ASSERT( RemainingByteCount < ByteCount );

        if (FlagOn(Irp->Flags, IRP_PAGING_IO)) {
            CollectDiskIoStats(Vcb, Scb, IrpContext->MajorFunction, NumberRuns);
        }

        //
        //  See if the write covers a single valid run, and if so pass
        //  it on.  Notice that if there is a single run but it does not
        //  begin at the beginning of the buffer then we will still need to
        //  allocate an associated Irp for this.
        //

        if ((RemainingByteCount == 0) &&
            (((NumberRuns == 1) &&
              (CompressionContext.IoRuns[0].BufferOffset == 0)) ||

            (NumberRuns == 0))) {

            DebugTrace( 0, Dbg, ("Passing Irp on to Disk Driver\n") );

            //
            //  See if there is an allocated run
            //

            if (NumberRuns == 1) {

                DebugTrace( 0, Dbg, ("One run\n") );

                //
                //  Now set up the Irp->IoStatus.  It will be modified by the
                //  completion routine in case of error or verify required.
                //

                Irp->IoStatus.Status = STATUS_SUCCESS;

                //
                //  We will continously try the I/O if we get a verify required
                //  back and can verify the volume
                //

                while (TRUE) {

                    //
                    //  Do the I/O and wait for it to finish
                    //

#ifdef PERF_STATS
                    if (TrackIos) {
                        TopLevelContext->ThreadIrpContext->Ios += 1;
                    }

                    if (CreateNewFile) {

                        InterlockedIncrement( &IrpContext->Vcb->IosPerCreates );
                        //KeQueryTickCount( &StartIo );
                        StartIo = KeQueryPerformanceCounter( NULL );
                    }
#endif

                    NtfsSingleAsync( IrpContext,
                                     Vcb->TargetDeviceObject,
                                     CompressionContext.IoRuns[0].StartingLbo,
                                     CompressionContext.IoRuns[0].ByteCount,
                                     Irp,
                                     IrpContext->MajorFunction,
                                     IrpSpFlags );

                    //
                    //  If this is an asynch transfer we return STATUS_PENDING.
                    //

                    if (!Wait) {

                        DebugTrace( -1, Dbg, ("NtfsNonCachedIo -> STATUS_PENDING\n") );
                        try_return(Status = STATUS_PENDING);

                    } else {

                        NtfsWaitSync( IrpContext );

#ifdef PERF_STATS
                        if (CreateNewFile) {

                            //KeQueryTickCount( &Now );
                            Now = KeQueryPerformanceCounter( NULL );
                            IrpContext->Vcb->TimePerCreateIos += Now.QuadPart - StartIo.QuadPart;

                        }
#endif

                    }

                    //
                    //  If we didn't get a verify required back then break out of
                    //  this loop
                    //

                    if (Irp->IoStatus.Status != STATUS_VERIFY_REQUIRED) { break; }

                    //
                    //  Otherwise we need to verify the volume, and if it doesn't
                    //  verify correctly the we dismount the volume and raise our
                    //  error
                    //

                    if (!NtfsPerformVerifyOperation( IrpContext, Vcb )) {

                        //**** NtfsPerformDismountOnVcb( IrpContext, Vcb, TRUE, NULL );
                        ClearFlag( Vcb->VcbState, VCB_STATE_VOLUME_MOUNTED );

                        NtfsRaiseStatus( IrpContext, STATUS_FILE_INVALID, NULL, NULL );
                    }

                    //
                    //  The volume verified correctly so now clear the verify bit
                    //  and try and I/O again
                    //

                    ClearFlag( Vcb->Vpb->RealDevice->Flags, DO_VERIFY_VOLUME );

                    //
                    //  Reset the status before retrying.
                    //

                    Irp->IoStatus.Status = STATUS_SUCCESS;
                }

                //
                //  See if we need to do a hot fix.  Hotfix if the request failed
                //  (except if called from WriteClusters) or we couldn't revert
                //  a USA block.
                //

                if ((!FT_SUCCESS( Irp->IoStatus.Status ) &&
                     ((IrpContext->MajorFunction != IRP_MJ_WRITE) ||
                      (Irp == IrpContext->OriginatingIrp))) ||
                    (FlagOn(Scb->ScbState, SCB_STATE_USA_PRESENT) &&
                     (IrpContext->MajorFunction == IRP_MJ_READ) &&
                     !NtfsVerifyAndRevertUsaBlock( IrpContext,
                                                   Scb,
                                                   Irp,
                                                   NULL,
                                                   0,
                                                   OriginalByteCount,
                                                   StartingVbo ))) {

                    //
                    //  Try to fix the problem
                    //

                    NtfsFixDataError( IrpContext,
                                      Scb,
                                      Vcb->TargetDeviceObject,
                                      Irp,
                                      1,
                                      CompressionContext.IoRuns,
                                      IrpSpFlags );
                }

            //
            //  Show that we successfully read zeros for the deallocated range.
            //

            } else {

                Irp->IoStatus.Status = STATUS_SUCCESS;
                Irp->IoStatus.Information = ByteCount;
            }

            DebugTrace( -1, Dbg, ("NtfsNonCachedIo -> %08lx\n", Irp->IoStatus.Status) );
            try_return( Status = Irp->IoStatus.Status );
        }

        //
        //  If there are bytes remaining and we cannot wait, then we must
        //  post this request unless we are doing paging io.
        //

        if (!Wait && (RemainingByteCount != 0)) {

            if (!FlagOn( Irp->Flags, IRP_PAGING_IO )) {

                NtfsRaiseStatus( IrpContext, STATUS_CANT_WAIT, NULL, NULL );
            }

            Wait = TRUE;
            SetFlag( IrpContext->State, IRP_CONTEXT_STATE_WAIT );

            ClearFlag( IrpContext->Union.NtfsIoContext->Flags, NTFS_IO_CONTEXT_ASYNC );
            KeInitializeEvent( &IrpContext->Union.NtfsIoContext->Wait.SyncEvent,
                               NotificationEvent,
                               FALSE );
        }

        //
        //  Now set up the Irp->IoStatus.  It will be modified by the
        //  multi-completion routine in case of error or verify required.
        //

        Irp->IoStatus.Status = STATUS_SUCCESS;

        //
        // Loop while there are still byte writes to satisfy.
        //

        while (TRUE) {

            //
            //  We will continously try the I/O if we get a verify required
            //  back and can verify the volume.  Note that we could have ended
            //  on a hole, and have no runs left.
            //

            if (NumberRuns != 0) {

                while (TRUE) {

#ifdef PERF_STATS

                    if (TrackIos) {
                        TopLevelContext->ThreadIrpContext->Ios += 1;
                    }

                    if (CreateNewFile) {


                        InterlockedIncrement( &IrpContext->Vcb->IosPerCreates );
                        //KeQueryTickCount( &StartIo );
                        StartIo = KeQueryPerformanceCounter( NULL );
                    }
#endif

                    //
                    //  Do the I/O and wait for it to finish
                    //

                    NtfsMultipleAsync( IrpContext,
                                       Vcb->TargetDeviceObject,
                                       Irp,
                                       NumberRuns,
                                       CompressionContext.IoRuns,
                                       IrpSpFlags );

                    //
                    //  If this is an asynchronous transfer, then return STATUS_PENDING.
                    //

                    if (!Wait) {

                        DebugTrace( -1, Dbg, ("NtfsNonCachedIo -> STATUS_PENDING\n") );
                        try_return( Status = STATUS_PENDING );
                    }

                    NtfsWaitSync( IrpContext );

#ifdef PERF_STATS
                    if (CreateNewFile) {

//                        KeQueryTickCount( &Now );
                        Now = KeQueryPerformanceCounter( NULL );
                        IrpContext->Vcb->TimePerCreateIos += Now.QuadPart - StartIo.QuadPart;
                    }
#endif

#ifdef SYSCACHE_DEBUG
                        if (ScbIsBeingLogged( Scb ) && (IrpContext->MajorFunction == IRP_MJ_WRITE)) {
                            FsRtlLogSyscacheEvent( Scb, SCE_WRITE, SCE_FLAG_SUB_WRITE, CompressionContext.IoRuns[NumberRuns-1].StartingVbo, CompressionContext.IoRuns[NumberRuns-1].ByteCount, Irp->IoStatus.Status );
                        }
#endif
                        //
                        //  If we didn't get a verify required back then break out of
                        //  this loop
                        //

                    if (Irp->IoStatus.Status != STATUS_VERIFY_REQUIRED) { break; }

                    //
                    //  Otherwise we need to verify the volume, and if it doesn't
                    //  verify correctly the we dismount the volume and raise our
                    //  error
                    //

                    if (!NtfsPerformVerifyOperation( IrpContext, Vcb )) {

                        //**** NtfsPerformDismountOnVcb( IrpContext, Vcb, TRUE, NULL );
                        ClearFlag( Vcb->VcbState, VCB_STATE_VOLUME_MOUNTED );

                        NtfsRaiseStatus( IrpContext, STATUS_FILE_INVALID, NULL, NULL );
                    }

                    //
                    //  The volume verified correctly so now clear the verify bit
                    //  and try and I/O again
                    //

                    ClearFlag( Vcb->Vpb->RealDevice->Flags, DO_VERIFY_VOLUME );

                    //
                    //  Reset the status before retrying.
                    //

                    Irp->IoStatus.Status = STATUS_SUCCESS;
                }

                //
                //  See if we need to do a hot fix.  Hotfix if the request failed
                //  (except if called from WriteClusters) or we couldn't revert
                //  a USA block.
                //

                if ((!FT_SUCCESS( Irp->IoStatus.Status ) &&
                     ((IrpContext->MajorFunction != IRP_MJ_WRITE) ||
                      (Irp == IrpContext->OriginatingIrp))) ||
                    (FlagOn(Scb->ScbState, SCB_STATE_USA_PRESENT) &&
                     (IrpContext->MajorFunction == IRP_MJ_READ) &&
                     !NtfsVerifyAndRevertUsaBlock( IrpContext,
                                                   Scb,
                                                   Irp,
                                                   NULL,
                                                   CompressionContext.IoRuns[0].BufferOffset,
                                                   OriginalByteCount -
                                                   CompressionContext.IoRuns[0].BufferOffset -
                                                   RemainingByteCount,
                                                   StartingVbo ))) {

                    //
                    //  Try to fix the problem
                    //

                    NtfsFixDataError( IrpContext,
                                      Scb,
                                      Vcb->TargetDeviceObject,
                                      Irp,
                                      NumberRuns,
                                      CompressionContext.IoRuns,
                                      IrpSpFlags );
                }
            }

            if (!NT_SUCCESS(Irp->IoStatus.Status) || (RemainingByteCount == 0)) { break; }

            if (CompressionContext.FinishBuffersNeeded) {

                Irp->IoStatus.Status =
                NtfsFinishBuffers( IrpContext,
                                   Irp,
                                   Scb,
                                   &StartingVbo,
                                   ByteCount - RemainingByteCount,
                                   NumberRuns,
                                   &CompressionContext,
                                   StreamFlags );

                if (!NT_SUCCESS(Irp->IoStatus.Status)) { break; }
            }

            StartingVbo = StartingVbo + (ByteCount - RemainingByteCount);
            CompressionContext.SystemBufferOffset += ByteCount - RemainingByteCount;

            ByteCount = RemainingByteCount;

            //
            //  Reset this boolean for each iteration.
            //

            CompressionContext.DataTransformed = FALSE;

            RemainingByteCount = NtfsPrepareBuffers( IrpContext,
                                                     Irp,
                                                     Scb,
                                                     &StartingVbo,
                                                     ByteCount,
                                                     StreamFlags,
                                                     &Wait,
                                                     &NumberRuns,
                                                     &CompressionContext );

            //
            //  If we are writing to an encrypted stream, now is the
            //  time to do the encryption, before we pass the buffer
            //  down to the disk driver below us.
            //
            if ((Scb->EncryptionContext != NULL) &&
                (IrpContext->MajorFunction == IRP_MJ_WRITE) &&
                (NtfsData.EncryptionCallBackTable.BeforeWriteProcess != NULL) &&
                (!FlagOn( StreamFlags, ENCRYPTED_STREAM ))) {

                ASSERT ( NtfsIsTypeCodeEncryptible( Scb->AttributeTypeCode ) );
                ASSERT( NumberRuns > 0 );

                NtfsEncryptBuffers( IrpContext,
                                    Irp,
                                    Scb,
                                    StartingVbo,
                                    NumberRuns,
                                    &CompressionContext );
            }

            ASSERT( RemainingByteCount < ByteCount );

            if (FlagOn(Irp->Flags, IRP_PAGING_IO)) {
                CollectDiskIoStats(Vcb, Scb, IrpContext->MajorFunction, NumberRuns);
            }
        }

        Status = Irp->IoStatus.Status;

    try_exit: NOTHING;

    } finally {

        //
        //  If this is a compressed file and we got success, go do our normal
        //  post processing.
        //

        if (CompressionContext.FinishBuffersNeeded &&
            NT_SUCCESS(Status) &&
            (Status != STATUS_PENDING) &&
            !AbnormalTermination() ) {

            Irp->IoStatus.Status =
            Status =
            NtfsFinishBuffers( IrpContext,
                               Irp,
                               Scb,
                               &StartingVbo,
                               ByteCount - RemainingByteCount,
                               NumberRuns,
                               &CompressionContext,
                               StreamFlags );
        }

        //
        //  For writes, free any Mdls which may have been used.
        //

        if (Mdl != NULL) {

            PMDL TempMdl;

            do {

                TempMdl = Mdl->Next;
                MmUnlockPages( Mdl );
                IoFreeMdl( Mdl );
                Mdl = TempMdl;

            } while (Mdl != NULL);
        }

        //
        //  Cleanup the compression context.
        //

        NtfsDeallocateCompressionBuffer( Irp, &CompressionContext, FALSE );
    }

    //
    //  Now set up the final byte count if we got success
    //

    if (Wait && NT_SUCCESS(Status)) {

        Irp->IoStatus.Information = OriginalByteCount;
    }

    DebugTrace( -1, Dbg, ("NtfsNonCachedIo -> %08lx\n", Status) );
    return Status;
}


VOID
NtfsNonCachedNonAlignedIo (
    IN PIRP_CONTEXT IrpContext,
    IN PIRP Irp,
    IN PSCB Scb,
    IN VBO StartingVbo,
    IN ULONG ByteCount
    )

/*++

Routine Description:

    This routine performs the non-cached disk io described in its parameters.
    This routine differs from the above in that the range does not have to be
    sector aligned.  This is needed, for example, on devices where the sector
    is 1024 and callers have generated 512 byte aligned i/o.  This is
    accomplished with the use of intermediate buffers.

    Currently only read is supported.

Arguments:

    IrpContext->MajorFunction - Supplies either IRP_MJ_READ or IRP_MJ_WRITE.

    Irp - Supplies the requesting Irp.

    Scb - Provides the stream to act on.

    StartingVbo - The starting point for the operation.

    ByteCount - The length of the operation.

Return Value:

    None.

--*/

{
    //
    // Declare some local variables for enumeration through the
    // runs of the file, and an array to store parameters for
    // parallel I/Os
    //

    LBO NextLbo;
    LCN NextLcn;
    ULONG NextLcnOffset;

    LONGLONG NextClusterCount;
    BOOLEAN NextIsAllocated;

    ULONG SectorOffset;
    ULONG SectorSize;
    ULONG BytesToCopy;
    ULONG OriginalByteCount;
    ULONG TailByteCount;
    VBO OriginalStartingVbo;

    PUCHAR UserBuffer;
    PUCHAR DiskBuffer = NULL;

    PMDL Mdl;
    PMDL SavedMdl;
    PVOID SavedUserBuffer;

    PVCB Vcb = Scb->Vcb;

    PAGED_CODE();

    DebugTrace( +1, Dbg, ("NtfsNonCachedNonAlignedRead\n") );
    DebugTrace( 0, Dbg, ("Irp                 = %08lx\n", Irp) );
    DebugTrace( 0, Dbg, ("MajorFunction       = %08lx\n", IrpContext->MajorFunction) );
    DebugTrace( 0, Dbg, ("Scb                 = %08lx\n", Scb) );
    DebugTrace( 0, Dbg, ("StartingVbo         = %016I64x\n", StartingVbo) );
    DebugTrace( 0, Dbg, ("ByteCount           = %08lx\n", ByteCount) );

    //
    //  Currently only read is supported.
    //

    ASSERT(IoGetCurrentIrpStackLocation(Irp)->MajorFunction != IRP_MJ_WRITE);

    //
    //  This code assumes the file is uncompressed.  Sparse files are supported.
    //  Before we assert that the file is uncompressed, assert that our test is
    //  going to be properly serialized.  We'll also be testing for the sparse
    //  attribute in the main code path, so we really need to be serialized here.
    //

    ASSERT( NtfsIsSharedScb( Scb ) ||
            ((Scb->Header.PagingIoResource != NULL) && NtfsIsSharedScbPagingIo( Scb )) );

    ASSERT( !FlagOn( Scb->AttributeFlags, ATTRIBUTE_FLAG_COMPRESSION_MASK ) );

    //
    //  Initialize some locals.
    //

    OriginalByteCount = ByteCount;
    OriginalStartingVbo = StartingVbo;
    SectorSize = Vcb->BytesPerSector;

    //
    // For nonbuffered I/O, we need the buffer locked in all
    // cases.
    //
    // This call may raise.  If this call succeeds and a subsequent
    // condition is raised, the buffers are unlocked automatically
    // by the I/O system when the request is completed, via the
    // Irp->MdlAddress field.
    //

    NtfsLockUserBuffer( IrpContext,
                        Irp,
                        IoWriteAccess,
                        IoGetCurrentIrpStackLocation(Irp)->Parameters.Read.Length );

    UserBuffer = NtfsMapUserBuffer( Irp, NormalPagePriority );

    //
    //  Allocate the local buffer.  Round to pages to avoid any device alignment
    //  problems.
    //

    DiskBuffer = NtfsAllocatePool( NonPagedPool,
                                    (ULONG) ROUND_TO_PAGES( SectorSize ));

    //
    //  We use a try block here to ensure the buffer is freed, and to
    //  fill in the correct byte count in the Iosb.Information field.
    //

    try {

        //
        //  If the beginning of the request was not aligned correctly, read in
        //  the first part first.
        //

        SectorOffset = ((ULONG)StartingVbo) & (SectorSize - 1);

        if (SectorOffset != 0) {

            //
            //  Try to lookup the first run.
            //

            NextIsAllocated = NtfsLookupAllocation( IrpContext,
                                                    Scb,
                                                    Int64ShraMod32( StartingVbo, Vcb->ClusterShift ),
                                                    &NextLcn,
                                                    &NextClusterCount,
                                                    NULL,
                                                    NULL );

            //
            //  If this is a sparse file and we've been asked to read in a
            //  deallocated range, we need to fill in the buffer with some
            //  zeroes and there's nothing to really read from the disk.
            //  If this isn't a sparse file, and this range isn't allocated,
            //  the file and/or mcb is corrupt.
            //

            if (!NextIsAllocated) {

                if (FlagOn( Scb->AttributeFlags, ATTRIBUTE_FLAG_SPARSE )) {

                    RtlZeroMemory( DiskBuffer + SectorOffset,
                                   SectorSize - SectorOffset );
                } else {

                    NtfsRaiseStatus( IrpContext, STATUS_FILE_CORRUPT_ERROR, NULL, Scb->Fcb );
                }

            } else {

                //
                //  Adjust for any Lcn offset to the start of the sector we want.
                //

                NextLcnOffset = ((ULONG)StartingVbo) & ~(SectorSize - 1);
                NextLcnOffset &= Vcb->ClusterMask;
                NextLbo = Int64ShllMod32(NextLcn, Vcb->ClusterShift);
                NextLbo = NextLbo + NextLcnOffset;

                NtfsSingleNonAlignedSync( IrpContext,
                                          Vcb,
                                          Scb,
                                          DiskBuffer,
                                          StartingVbo + NextLcnOffset,
                                          NextLbo,
                                          SectorSize,
                                          Irp );

                if (!NT_SUCCESS( Irp->IoStatus.Status )) {

                    try_return( NOTHING );
                }
            }

            //
            //  Now copy the part of the first sector that we want to the user
            //  buffer.
            //

            BytesToCopy = (ByteCount >= SectorSize - SectorOffset
                           ? SectorSize - SectorOffset
                           : ByteCount);

            RtlCopyMemory( UserBuffer,
                           DiskBuffer + SectorOffset,
                           BytesToCopy );

            StartingVbo = StartingVbo + BytesToCopy;

            ByteCount -= BytesToCopy;

            if (ByteCount == 0) {

                try_return( NOTHING );
            }
        }

        ASSERT( (((ULONG)StartingVbo) & (SectorSize - 1)) == 0 );

        //
        //  If there is a tail part that is not sector aligned, read it.
        //

        TailByteCount = (ByteCount & (SectorSize - 1));

        if (TailByteCount != 0) {

            VBO LastSectorVbo;

            LastSectorVbo = BlockAlignTruncate( StartingVbo + ByteCount, (LONG)SectorSize );

            //
            //  Try to lookup the last part of the requested range.
            //

            NextIsAllocated = NtfsLookupAllocation( IrpContext,
                                                    Scb,
                                                    Int64ShraMod32( LastSectorVbo, Vcb->ClusterShift ),
                                                    &NextLcn,
                                                    &NextClusterCount,
                                                    NULL,
                                                    NULL );

            //
            //  If this is a sparse file and we've been asked to read in a
            //  deallocated range, we need to fill in the buffer with some
            //  zeroes and there's nothing to really read from the disk.
            //  If this isn't a sparse file, and this range isn't allocated,
            //  the file and/or mcb is corrupt.
            //

            if (!NextIsAllocated) {

                if (FlagOn( Scb->AttributeFlags, ATTRIBUTE_FLAG_SPARSE )) {

                    RtlZeroMemory( DiskBuffer,
                                   TailByteCount );
                } else {

                    NtfsRaiseStatus( IrpContext, STATUS_FILE_CORRUPT_ERROR, NULL, Scb->Fcb );
                }

            } else {

                //
                //  Adjust for any Lcn offset.
                //

                NextLcnOffset = ((ULONG)LastSectorVbo) & Vcb->ClusterMask;
                NextLbo = Int64ShllMod32(NextLcn, Vcb->ClusterShift);
                NextLbo = NextLbo + NextLcnOffset;

                NtfsSingleNonAlignedSync( IrpContext,
                                          Vcb,
                                          Scb,
                                          DiskBuffer,
                                          LastSectorVbo,
                                          NextLbo,
                                          SectorSize,
                                          Irp );

                if (!NT_SUCCESS( Irp->IoStatus.Status )) {

                    try_return( NOTHING );
                }
            }

            //
            //  Now copy over the part of this last sector that we need.
            //

            BytesToCopy = TailByteCount;

            UserBuffer += (ULONG)(LastSectorVbo - OriginalStartingVbo);

            RtlCopyMemory( UserBuffer, DiskBuffer, BytesToCopy );

            ByteCount -= BytesToCopy;

            if (ByteCount == 0) {

                try_return( NOTHING );
            }
        }

        ASSERT( ((((ULONG)StartingVbo) | ByteCount) & (SectorSize - 1)) == 0 );

        //
        //  Now build a Mdl describing the sector aligned balance of the transfer,
        //  and put it in the Irp, and read that part.
        //

        SavedMdl = Irp->MdlAddress;
        Irp->MdlAddress = NULL;

        SavedUserBuffer = Irp->UserBuffer;

        Irp->UserBuffer = (PUCHAR)MmGetMdlVirtualAddress( SavedMdl ) +
                          (ULONG)(StartingVbo - OriginalStartingVbo);


        Mdl = IoAllocateMdl(Irp->UserBuffer,
                            ByteCount,
                            FALSE,
                            FALSE,
                            Irp);

        if (Mdl == NULL) {

            Irp->MdlAddress = SavedMdl;
            Irp->UserBuffer = SavedUserBuffer;
            NtfsRaiseStatus( IrpContext, STATUS_INSUFFICIENT_RESOURCES, NULL, NULL );
        }

        IoBuildPartialMdl(SavedMdl,
                          Mdl,
                          Irp->UserBuffer,
                          ByteCount);

        //
        //  Try to read in the pages.
        //

        try {

            NtfsNonCachedIo( IrpContext,
                             Irp,
                             Scb,
                             StartingVbo,
                             ByteCount,
                             0 );

        } finally {

            IoFreeMdl( Irp->MdlAddress );

            Irp->MdlAddress = SavedMdl;
            Irp->UserBuffer = SavedUserBuffer;
        }

    try_exit: NOTHING;

    } finally {

        NtfsFreePool( DiskBuffer );

        if ( !AbnormalTermination() && NT_SUCCESS(Irp->IoStatus.Status) ) {

            Irp->IoStatus.Information = OriginalByteCount;

            //
            //  We now flush the user's buffer to memory.
            //

            KeFlushIoBuffers( Irp->MdlAddress, TRUE, FALSE );
        }
    }

    DebugTrace( -1, Dbg, ("NtfsNonCachedNonAlignedRead -> VOID\n") );
    return;
}


BOOLEAN
NtfsVerifyAndRevertUsaBlock (
    IN PIRP_CONTEXT IrpContext,
    IN PSCB Scb,
    IN PIRP Irp OPTIONAL,
    IN PVOID SystemBuffer OPTIONAL,
    IN ULONG Offset,
    IN ULONG Length,
    IN LONGLONG FileOffset
    )

/*++

Routine Description:

    This routine will revert the bytes in all of the structures protected by
    update sequence arrays.  It copies the bytes from each Usa to the
    separate blocks protected.

    If a structure does not verify correctly, then its signature is set
    to BaadSignature.  The one exception is a structure in a stream other
    than the log file whose header fails the sanity checks and which lies
    in an unallocated run at least one structure long: it receives
    HoleSignature instead and is not counted as a failure.

    The buffer is reached either through SystemBuffer or by mapping the
    Irp's buffer.  If the Irp's buffer cannot be mapped, each structure is
    mapped in turn through the Vcb's reserved mapping while holding
    Vcb->ReservedMappingMutex.

Arguments:

    IrpContext - Context of the current request.  It is passed to
        NtfsLookupAllocation and supplies the Vcb whose reserved mapping is used.

    Scb - The scb being read

    Irp - Contains the buffer to be reverted.  Exactly one of Irp and
        SystemBuffer must be supplied.

    SystemBuffer - contains the buffer if irp is null

    Offset - Offset within the buffer to be reverted

    Length - Length of the buffer to be reverted starting at the offset

    FileOffset - Offset within the file the buffer originates from

Return Value:

    FALSE - if at least one block did not verify correctly and received a BaadSignature
    TRUE - if no blocks received a BaadSignature.  This includes the case where
        the structure size is zero (virgin log file) and nothing is examined.

--*/

{
    PMULTI_SECTOR_HEADER MultiSectorHeader;
    PUSHORT SequenceArray;
    PUSHORT SequenceNumber;
    ULONG StructureSize;
    USHORT CountBlocks;
    PUSHORT ProtectedUshort;
    PVCB Vcb = Scb->Vcb;
    ULONG BytesLeft = Length;

    //
    //  Stack storage for a partial Mdl with room for two page frame numbers.
    //  It is only used when we have to fall back to the reserved mapping.
    //

    UCHAR Buffer[sizeof( MDL ) + sizeof( PFN_NUMBER ) * 2];
    PMDL PartialMdl = (PMDL) Buffer;

    BOOLEAN ReservedMapping = FALSE;
    BOOLEAN Result = TRUE;

    PAGED_CODE();

    DebugTrace( +1, Dbg, ("NtfsVerifyAndRevertUsaBlock:  Entered\n") );

    ASSERT( (ARGUMENT_PRESENT( Irp ) && !ARGUMENT_PRESENT( SystemBuffer )) ||
            (!ARGUMENT_PRESENT( Irp ) && ARGUMENT_PRESENT( SystemBuffer )) );

    //
    //  Cast the buffer pointer to a Multi-Sector-Header and verify that this
    //  block has been initialized.  When an Irp was supplied, try to map its
    //  buffer first; the NoRaise variant leaves SystemBuffer NULL on failure.
    //

    if (ARGUMENT_PRESENT( Irp )) {
        SystemBuffer = NtfsMapUserBufferNoRaise( Irp, HighPagePriority );
    }

    //
    //  We can't map the user buffer due to low resources - so switch to using the reserved
    //  mapping instead.  Only the first sector is mapped here because this pass
    //  only reads the header fields needed to compute the structure size.
    //

    if (SystemBuffer == NULL) {

        ExAcquireFastMutexUnsafe( &Vcb->ReservedMappingMutex );
        ReservedMapping = TRUE;

        MmInitializeMdl( PartialMdl, NULL, 2 * PAGE_SIZE );
        IoBuildPartialMdl( Irp->MdlAddress, PartialMdl, Add2Ptr( MmGetMdlBaseVa( Irp->MdlAddress ), MmGetMdlByteOffset( Irp->MdlAddress ) + Offset ), Vcb->BytesPerSector );
        MultiSectorHeader = (PMULTI_SECTOR_HEADER) MmMapLockedPagesWithReservedMapping( IrpContext->Vcb->ReservedMapping,
                                                                                        RESERVE_POOL_TAG,
                                                                                        PartialMdl,
                                                                                        MmCached );
        ASSERT( MultiSectorHeader != NULL );
    } else {
        MultiSectorHeader = (PMULTI_SECTOR_HEADER)Add2Ptr( SystemBuffer, Offset );
    }


    //
    //  Get the number of blocks, based on what type of stream it is.
    //  First check for Mft or Log file.
    //

    if (Scb->Header.NodeTypeCode == NTFS_NTC_SCB_MFT) {

        ASSERT((Scb == Vcb->MftScb) || (Scb == Vcb->Mft2Scb));

        StructureSize = Vcb->BytesPerFileRecordSegment;

    } else if (Scb->Header.NodeTypeCode == NTFS_NTC_SCB_DATA) {

        ASSERT( Scb == Vcb->LogFileScb );

        //
        //  On the first pass through the log file, we see all -1,
        //  and we just want to let it go.
        //

        if (*(PULONG)&MultiSectorHeader->Signature == MAXULONG) {

            //
            //  Use the structure size given us by Lfs if present.  This may
            //  be zero, which is handled below as the virgin log case.
            //

            StructureSize = Vcb->LfsWriteData.LfsStructureSize;

        //
        //  Use the current size in the USA
        //

        } else {

            CountBlocks = (USHORT)(MultiSectorHeader->UpdateSequenceArraySize - 1);
            StructureSize = CountBlocks * SEQUENCE_NUMBER_STRIDE;

            //
            //  Check for plausibility and otherwise use page size.
            //

            if ((StructureSize != 0x1000)  && (StructureSize != 0x2000) && (StructureSize != PAGE_SIZE)) {

                StructureSize = PAGE_SIZE;
            }
        }

    //
    //  Otherwise it is an index, so we can get the count out of the Scb.
    //

    } else {

        StructureSize = Scb->ScbType.Index.BytesPerIndexBuffer;

        ASSERT((StructureSize == 0x800) || (StructureSize == 0x1000) || (StructureSize == 0x400));
        ASSERT((Length & (StructureSize - 1)) == 0);
    }

    //
    //  We're done with the mapped buffer so release the reserved mapping if we used them
    //

    if (ReservedMapping) {
        MmUnmapReservedMapping( Vcb->ReservedMapping, RESERVE_POOL_TAG, PartialMdl );
        MmPrepareMdlForReuse( PartialMdl );
        ExReleaseFastMutexUnsafe( &Vcb->ReservedMappingMutex );
        ReservedMapping = FALSE;
        MultiSectorHeader = NULL;
    }

    //
    //  A zero structure size can only come from the Lfs value above, i.e. the
    //  log file has not been initialized yet.  There is nothing to verify.
    //

    if (StructureSize == 0) {

        ASSERT( Scb == Vcb->LogFileScb );

        DebugTrace( -1, Dbg, ("NtfsVerifyAndRevertUsaBlock: (Virgin Log)\n") );
        return TRUE;
    }

    ASSERTMSG( "ReservedMapping should be large enough for this structure\n", StructureSize < 2 * PAGE_SIZE );

    CountBlocks = (USHORT)(StructureSize / SEQUENCE_NUMBER_STRIDE);

    //
    //  Loop through all of the multi-sector blocks in this transfer.
    //  (Offset + Length - BytesLeft) is the buffer offset of the structure
    //  currently being processed.
    //

    do {

        //
        //  First find our location in the MultiSectorHeader - use reserve mappings
        //  if we haven't got a system buffer.  In that case the whole structure
        //  is mapped and the mutex is held until the end of this iteration.
        //

        if (!SystemBuffer) {

            ExAcquireFastMutexUnsafe( &Vcb->ReservedMappingMutex );
            ReservedMapping = TRUE;

            IoBuildPartialMdl( Irp->MdlAddress,
                               PartialMdl,
                               Add2Ptr( MmGetMdlBaseVa( Irp->MdlAddress ), MmGetMdlByteOffset( Irp->MdlAddress ) + Offset + Length - BytesLeft),
                               StructureSize );
            MultiSectorHeader = (PMULTI_SECTOR_HEADER) MmMapLockedPagesWithReservedMapping( IrpContext->Vcb->ReservedMapping,
                                                                                            RESERVE_POOL_TAG,
                                                                                            PartialMdl,
                                                                                            MmCached );

            ASSERT( MultiSectorHeader != NULL );
        } else {
            MultiSectorHeader = (PMULTI_SECTOR_HEADER)Add2Ptr( SystemBuffer, Offset + Length - BytesLeft );
        }

        //
        //  Uninitialized log file pages always must contain MAXULONG, which is
        //  not a valid signature.  Do not do the check if we see MAXULONG.  Also
        //  since we may have read random uninitialized data, we must check every
        //  possible field that could cause us to fault or go outside of the block,
        //  and also not check in this case.
        //

        //
        //  For 0 or MAXULONG we assume the value is "expected", and we do not
        //  want to replace with the BaadSignature, just move on.
        //

        if ((*(PULONG)&MultiSectorHeader->Signature == MAXULONG) ||
            (*(PULONG)&MultiSectorHeader->Signature == 0)) {

            NOTHING;

        //
        //  The header is only trusted if the Usa size matches the expected
        //  block count, the Usa offset is even and lies between the end of the
        //  header and the end of the first block, and the whole structure fits
        //  in what is left of the transfer.
        //

        } else if ((CountBlocks == (USHORT)(MultiSectorHeader->UpdateSequenceArraySize - 1)) &&
                   !FlagOn(MultiSectorHeader->UpdateSequenceArrayOffset, 1) &&
                   (MultiSectorHeader->UpdateSequenceArrayOffset >= sizeof( MULTI_SECTOR_HEADER )) &&
                   (MultiSectorHeader->UpdateSequenceArrayOffset < SEQUENCE_NUMBER_STRIDE) &&
                   (StructureSize <= BytesLeft)) {

            ULONG CountToGo;

            CountToGo = CountBlocks;

            //
            //  Compute the array offset and recover the current sequence number.
            //  The saved shorts follow immediately after the sequence number.
            //

            SequenceNumber = (PUSHORT)Add2Ptr( MultiSectorHeader,
                                               MultiSectorHeader->UpdateSequenceArrayOffset );

            SequenceArray = SequenceNumber + 1;

            //
            //  We now walk through each block, and insure that the last short in each
            //  block matches the sequence number.
            //

            ProtectedUshort = (PUSHORT) (Add2Ptr( MultiSectorHeader,
                                                  SEQUENCE_NUMBER_STRIDE - sizeof( USHORT )));

            //
            //  Loop to test for the correct sequence numbers and restore the
            //  sequence numbers.
            //

            do {

                //
                //  If the last short of this block does not match the sequence
                //  number, the block did not verify.  Unless one of the
                //  exceptions below applies, smash the signature so the
                //  structure is easily recognized as bad, and stop.
                //

                if (*ProtectedUshort != *SequenceNumber) {

                    //
                    //  We do nothing except exit if this is the log file and
                    //  the signature is the chkdsk signature.
                    //

                    if ((Scb != Vcb->LogFileScb) ||
                        (*(PULONG)MultiSectorHeader->Signature != *(PULONG)ChkdskSignature)) {

                        //
                        //  If this is the Mft or an index buffer and all of the data for this file
                        //  record is contained in the verified range of the
                        //  record then don't mark it bad.
                        //
                        //  NOTE(review): 'continue' inside a do/while transfers
                        //  to the loop test, so ProtectedUshort and SequenceArray
                        //  are not advanced.  The same mismatching short is then
                        //  retested while CountToGo counts down, which means the
                        //  remaining blocks are left unreverted - confirm this
                        //  is the intent.
                        //

                        if ((Scb == Vcb->MftScb) || (Scb == Vcb->Mft2Scb)) {

                            PFILE_RECORD_SEGMENT_HEADER FileRecord;

                            FileRecord = (PFILE_RECORD_SEGMENT_HEADER) MultiSectorHeader;

                            if (FileRecord->FirstFreeByte < ((CountBlocks - CountToGo) * SEQUENCE_NUMBER_STRIDE)) {

                                continue;
                            }
                        } else if (*(PULONG)MultiSectorHeader->Signature == *(PULONG)IndexSignature ) {

                            PINDEX_ALLOCATION_BUFFER IndexBuffer;

                            IndexBuffer = (PINDEX_ALLOCATION_BUFFER) MultiSectorHeader;

                            if (IndexBuffer->IndexHeader.FirstFreeByte < ((CountBlocks - CountToGo) * SEQUENCE_NUMBER_STRIDE)) {

                                continue;
                            }
                        }

                        *(PULONG)MultiSectorHeader->Signature = *(PULONG)BaadSignature;
                        Result = FALSE;
                    }

                    break;

                } else {

                    //
                    //  The block verified, put back the short saved in the Usa.
                    //

                    *ProtectedUshort = *SequenceArray++;
                }

                ProtectedUshort += (SEQUENCE_NUMBER_STRIDE / sizeof( USHORT ));

            } while (--CountToGo != 0);

        //
        //  The header failed the sanity checks.  If this is the log file, we
        //  report an error unless the current signature is the chkdsk signature.
        //

        } else if (Scb == Vcb->LogFileScb) {

            if (*(PULONG)MultiSectorHeader->Signature != *(PULONG)ChkdskSignature) {

                *(PULONG)MultiSectorHeader->Signature = *(PULONG)BaadSignature;
                Result = FALSE;
            }

        //
        //  The header failed the sanity checks and this is not the log file.
        //  Look at the allocation behind this structure to decide whether it
        //  is a hole or a bad structure.
        //

        } else {

            VCN Vcn;
            LCN Lcn;
            LONGLONG ClusterCount;
            BOOLEAN IsAllocated;

            Vcn = LlClustersFromBytesTruncate( Vcb, FileOffset );

            //
            //  Release the reserved buffer before calling NtfsLookupAllocation.
            //  NOTE(review): presumably so the reserved mapping mutex is not
            //  held across the lookup - confirm.
            //

            if (ReservedMapping) {
                MmUnmapReservedMapping( Vcb->ReservedMapping, RESERVE_POOL_TAG, PartialMdl );
                MmPrepareMdlForReuse( PartialMdl );
                ExReleaseFastMutexUnsafe( &Vcb->ReservedMappingMutex );
                MultiSectorHeader = NULL;
                ReservedMapping = FALSE;
            }

            IsAllocated = NtfsLookupAllocation( IrpContext,
                                                Scb,
                                                Vcn,
                                                &Lcn,
                                                &ClusterCount,
                                                NULL,
                                                NULL );

            //
            //  Map the structure again so that we can store the new signature.
            //  With a system buffer MultiSectorHeader is still valid.
            //

            if (!SystemBuffer) {

                ExAcquireFastMutexUnsafe( &Vcb->ReservedMappingMutex );
                ReservedMapping = TRUE;

                IoBuildPartialMdl( Irp->MdlAddress, PartialMdl, Add2Ptr( MmGetMdlBaseVa( Irp->MdlAddress ), MmGetMdlByteOffset( Irp->MdlAddress ) + Offset + Length - BytesLeft), StructureSize );
                MultiSectorHeader = MmMapLockedPagesWithReservedMapping( IrpContext->Vcb->ReservedMapping,
                                                              RESERVE_POOL_TAG,
                                                              PartialMdl,
                                                              MmCached );

                ASSERT( MultiSectorHeader != NULL );
            }


            //
            //  An unallocated run covering at least a whole structure is a
            //  hole and is not reported as a failure.  Anything else is bad.
            //

            if (!IsAllocated &&
                ( ClusterCount >= LlClustersFromBytes( Vcb, StructureSize))) {

                *(PULONG)MultiSectorHeader->Signature = *(PULONG)HoleSignature;
            } else {
                *(PULONG)MultiSectorHeader->Signature = *(PULONG)BaadSignature;
                Result = FALSE;
            }
        }

        //
        //  Release the reserve mapping before looping
        //

        if (ReservedMapping) {
            MmUnmapReservedMapping( Vcb->ReservedMapping, RESERVE_POOL_TAG, PartialMdl );
            MmPrepareMdlForReuse( PartialMdl );
            ExReleaseFastMutexUnsafe( &Vcb->ReservedMappingMutex );
            MultiSectorHeader = NULL;
            ReservedMapping = FALSE;
        }

        //
        //  Move on to the next structure.  Clamp to zero instead of subtracting
        //  so that a trailing partial structure cannot underflow BytesLeft.
        //

        if (BytesLeft > StructureSize) {
            BytesLeft -= StructureSize;
        } else {
            BytesLeft = 0;
        }
        FileOffset = FileOffset + StructureSize;

    } while (BytesLeft != 0);

    DebugTrace( -1, Dbg, ("NtfsVerifyAndRevertUsaBlock:  Exit\n") );
    return Result;
}


VOID
NtfsTransformUsaBlock (
    IN PSCB Scb,
    IN OUT PVOID SystemBuffer,
    IN OUT PVOID Buffer,
    IN ULONG Length
    )

/*++

Routine Description:

    This routine will implement Usa protection for all structures of the
    transfer described by the caller.  It does so by copying the last
    short in each block of each Usa-protected structure to the
    Usa and storing the current sequence number into each of these shorts.

    It also increments the sequence number in the Usa, both in the buffer
    being transformed and in the original buffer.

    Structures whose signature is BaadSignature, HoleSignature or MAXULONG,
    or whose Usa offset is odd or outside the first block, are left
    untransformed.

    Only whole structures are transformed.  If Length is not a multiple of
    the structure size the trailing partial structure is left untouched,
    which matches the way NtfsVerifyAndRevertUsaBlock treats the tail of a
    transfer and keeps this routine from running past the end of the buffers.

Arguments:

    Scb - The stream being written.  Its node type selects the size of each
        protected structure (file record, Lfs structure or index buffer).

    SystemBuffer - The original buffer.  It is walked in step with Buffer and
        the sequence number of every transformed structure is incremented
        here as well.

    Buffer - This is the pointer to the start of the first structure to
        transform.

    Length - This is the number of bytes in the transfer.

Return Value:

    None.

--*/

{
    PMULTI_SECTOR_HEADER MultiSectorHeader;
    PUSHORT SequenceArray;
    PUSHORT SequenceNumber;
    ULONG StructureSize;
    USHORT CountBlocks;
    PUSHORT ProtectedUshort;
    PVCB Vcb = Scb->Vcb;
    ULONG BytesLeft = Length;

    PAGED_CODE();

    DebugTrace( +1, Dbg, ("NtfsTransformUsaBlock:  Entered\n") );

    //
    //  Cast the buffer pointer to a Multi-Sector-Header and verify that this
    //  block has been initialized.
    //

    MultiSectorHeader = (PMULTI_SECTOR_HEADER) Buffer;

    //
    //  Get the number of blocks, based on what type of stream it is.
    //  First check for Mft or Log file.
    //

    if (Scb->Header.NodeTypeCode == NTFS_NTC_SCB_MFT) {

        ASSERT((Scb == Vcb->MftScb) || (Scb == Vcb->Mft2Scb));

        StructureSize = Vcb->BytesPerFileRecordSegment;

    } else if (Scb->Header.NodeTypeCode == NTFS_NTC_SCB_DATA) {

        //
        //  For the log file, use the value that Lfs has stored in the
        //  Lfs WRITE_DATA structure.
        //

        ASSERT( Scb == Vcb->LogFileScb );
        ASSERT( Vcb->LfsWriteData.LfsStructureSize != 0 );

        StructureSize = Vcb->LfsWriteData.LfsStructureSize;

    //
    //  Otherwise it is an index, so we can get the count out of the Scb.
    //

    } else {

        StructureSize = Scb->ScbType.Index.BytesPerIndexBuffer;

        ASSERT((StructureSize == 0x800) || (StructureSize == 0x1000) || (StructureSize == 0x400));
        ASSERT((Length & (StructureSize - 1)) == 0);
    }

    CountBlocks = (USHORT)(StructureSize / SEQUENCE_NUMBER_STRIDE);

    //
    //  A structure smaller than one block has nothing to protect.  Bail out
    //  now, otherwise the per-block loop below would start with a count of
    //  zero and wrap, and a zero structure size would never advance.
    //

    if (CountBlocks == 0) {

        DebugTrace( -1, Dbg, ("NtfsTransformUsaBlock:  Exit -> %08lx\n", StructureSize) );
        return;
    }

    //
    //  Loop through all of the multi-sector blocks in this transfer.  We only
    //  enter the loop while a whole structure remains, so BytesLeft can never
    //  underflow and we never touch memory beyond Length bytes.
    //

    while (BytesLeft >= StructureSize) {

        //
        //  Any uninitialized structures will begin with BaadSignature or
        //  MAXULONG, as guaranteed by the Revert routine above.  Holes are
        //  skipped as well, and so is anything whose Usa offset is odd or
        //  does not lie between the header and the end of the first block.
        //

        if ((*(PULONG)&MultiSectorHeader->Signature != *(PULONG)BaadSignature) &&
            (*(PULONG)&MultiSectorHeader->Signature != *(PULONG)HoleSignature) &&
            (*(PULONG)&MultiSectorHeader->Signature != MAXULONG) &&
            ((MultiSectorHeader->UpdateSequenceArrayOffset & 1) == 0) &&
            (MultiSectorHeader->UpdateSequenceArrayOffset >= sizeof( MULTI_SECTOR_HEADER )) &&
            (MultiSectorHeader->UpdateSequenceArrayOffset < SEQUENCE_NUMBER_STRIDE)) {

            ULONG CountToGo = CountBlocks;

            //
            //  Compute the array offset and recover the current sequence number.
            //

            SequenceNumber = (PUSHORT)Add2Ptr( MultiSectorHeader,
                                               MultiSectorHeader->UpdateSequenceArrayOffset );

            //
            //  Increment sequence number before the write, both in the buffer
            //  going out and in the original buffer pointed to by SystemBuffer.
            //  Skip numbers with all 0's and all 1's because 0's are produced
            //  by common failure cases and -1 is used by hot fix.
            //

            do {

                *SequenceNumber += 1;

                *(PUSHORT)Add2Ptr( SystemBuffer,
                                   MultiSectorHeader->UpdateSequenceArrayOffset ) += 1;

            } while ((*SequenceNumber == 0) || (*SequenceNumber == 0xFFFF));

            SequenceArray = SequenceNumber + 1;

            //
            //  We now walk through each block to copy each protected short
            //  to the sequence array, and replacing it by the incremented
            //  sequence number.
            //

            ProtectedUshort = (PUSHORT) (Add2Ptr( MultiSectorHeader,
                                                  SEQUENCE_NUMBER_STRIDE - sizeof( USHORT )));

            //
            //  Loop to save the protected shorts and stamp the sequence
            //  number over them.
            //

            do {

                *SequenceArray++ = *ProtectedUshort;
                *ProtectedUshort = *SequenceNumber;

                ProtectedUshort += (SEQUENCE_NUMBER_STRIDE / sizeof( USHORT ));

            } while (--CountToGo != 0);
        }

        //
        //  Now adjust all pointers and counts before looping back.
        //

        MultiSectorHeader = (PMULTI_SECTOR_HEADER)Add2Ptr( MultiSectorHeader,
                                                           StructureSize );
        SystemBuffer = Add2Ptr( SystemBuffer, StructureSize );
        BytesLeft -= StructureSize;
    }

    DebugTrace( -1, Dbg, ("NtfsTransformUsaBlock:  Exit -> %08lx\n", StructureSize) );
    return;
}


VOID
NtfsCreateMdlAndBuffer (
    IN PIRP_CONTEXT IrpContext,
    IN PSCB ThisScb,
    IN UCHAR NeedTwoBuffers,
    IN OUT PULONG Length,
    OUT PMDL *Mdl OPTIONAL,
    OUT PVOID *Buffer
    )

/*++

Routine Description:

    This routine will allocate a buffer and create an Mdl which describes
    it.  This buffer and Mdl can then be used for an I/O operation, the
    pages will be locked in memory.

    This routine is intended to be used for cases where large I/Os are
    required.  It attempts to avoid allocations errors and bugchecks by
    using a reserved buffer scheme.  In order for this scheme to work without
    deadlocks, the calling thread must have all resources acquired that it
    will need prior to doing the I/O.  I.e., this routine itself may acquire
    a resource which must work as an end resource.

    Examples of callers to this routine are noncached writes to USA streams,
    and noncached reads and writes to compressed streams.  One case to be
    aware of is the case where a noncached compressed write needs to fault
    in the rest of a compression unit, in order to write the entire unit.
    In an extreme case the noncached writer will allocate one reserved buffer,
    and the noncached read of the rest of the compression unit may need to
    recursively acquire the resource in this routine and allocate the other
    reserved buffer.

Arguments:

    ThisScb - Scb for the file where the IO is occurring.

    NeedTwoBuffers - Indicates that this is the request for a buffer for
        a transaction which may need two buffers.  A value of RESERVED_BUFFER_ONE_NEEDED means only 1
        buffer is needed.  A value of RESERVED_BUFFER_TWO_NEEDED or RESERVED_BUFFER_WORKSPACE_NEEDED
        indicates that we need two buffers and either ReservedBuffer1 or ReservedBuffer2 should be acquired.

    Length - This is the length needed for this buffer, returns (possibly larger)
        length allocated.

    Mdl - This is the address to store the address of the Mdl created.

    Buffer - This is the address to store the address of the buffer allocated.

Return Value:

    None.

--*/

{
    PVOID TempBuffer;
    PMDL TempMdl;

    PAGED_CODE();

    DebugTrace( +1, Dbg, ("NtfsCreateMdlAndBuffer:  Entered\n") );

    ASSERT( (NeedTwoBuffers == RESERVED_BUFFER_WORKSPACE_NEEDED) ?
            (*Length <= WORKSPACE_BUFFER_SIZE) :
            (*Length <= LARGE_BUFFER_SIZE) );

    TempBuffer = NULL;
    TempMdl = NULL;

    //
    //  If this thread already owns a buffer then call to get the second.
    //
    //  If there have been no allocation failures recently, and
    //  we can use at least half of a big buffer, then go for
    //  one of our preallocated buffers first.
    //

    if ((NtfsReservedBufferThread == (PVOID) PsGetCurrentThread()) ||
        ((*Length >= LARGE_BUFFER_SIZE / 2) && !NtfsBufferAllocationFailure)) {

        //
        //  If we didn't get one then try from pool.
        //

        if (!NtfsGetReservedBuffer( ThisScb->Fcb, &TempBuffer, Length, NeedTwoBuffers )) {

            TempBuffer = NtfsAllocatePoolWithTagNoRaise( NonPagedPoolCacheAligned, *Length, '9ftN' );
        }

    //
    //  Otherwise try to allocate from pool and then get a reserved buffer if
    //  there have been no allocation errors recently.
    //

    } else {

        TempBuffer = NtfsAllocatePoolWithTagNoRaise( NonPagedPoolCacheAligned, *Length, '9ftN' );

        if ((TempBuffer == NULL) && !NtfsBufferAllocationFailure) {

            NtfsGetReservedBuffer( ThisScb->Fcb, &TempBuffer, Length, NeedTwoBuffers );
        }
    }

    //
    //  If we could not allocate a buffer from pool, then
    //  we must stake our claim to a reserved buffer.
    //
    //  We would like to queue the requests which need a single buffer because
    //  they won't be completely blocked by the owner of multiple buffers.
    //  But if this thread wants multiple buffers and there is already a
    //  thread with multiple buffers then fail this request with FILE_LOCK_CONFLICT
    //  in case the current thread is holding some resource needed by the
    //  existing owner.
    //

    if (TempBuffer == NULL) {

        ExAcquireResourceExclusiveLite( &NtfsReservedBufferResource, TRUE );

        //
        //  Show that we have gotten an allocation failure
        //

        NtfsBufferAllocationFailure = TRUE;

        //
        //  Loop here until we get a buffer or abort the current request.
        //

        while (TRUE) {

            KeDelayExecutionThread( KernelMode, FALSE, &NtfsShortDelay );

            if (NtfsGetReservedBuffer( ThisScb->Fcb, &TempBuffer, Length, NeedTwoBuffers )) {

                if (ExGetExclusiveWaiterCount( &NtfsReservedBufferResource ) == 0) {

                    NtfsBufferAllocationFailure = FALSE;
                }

                ExReleaseResourceLite( &NtfsReservedBufferResource );
                break;
            }

            //
            //  We will perform some deadlock detection here and raise
            //  STATUS_FILE_LOCK conflict in order to retry this request if
            //  anyone is queued behind the resource.  Deadlocks can occur
            //  under the following circumstances when another thread is
            //  blocked behind this resource:
            //
            //      - Current thread needs two buffers.  We can't block the
            //          Needs1 guy which may need to complete before the
            //          current Needs2 guy can proceed.  Exception is case
            //          where current thread already has a buffer and we
            //          have a recursive 2 buffer case.  In this case we
            //          are only waiting for the third buffer to become
            //          available.
            //
            //      - Current thread is the lazy writer.  Lazy writer will
            //          need buffer for USA transform.  He also can own
            //          the BCB resource that might be needed by the current
            //          owner of a buffer.
            //
            //      - Current thread is operating on the same Fcb as the owner
            //          of any of the buffers.
            //

            //
            //  If the current thread already owns one of the two buffers then
            //  always allow him to loop.  Otherwise perform deadlock detection
            //  if we need 2 buffers or this is the lazy writer or we
            //  are trying to get the same Fcb already owned by the 2 buffer guy.
            //

            if ((PsGetCurrentThread() != NtfsReservedBufferThread) &&

                (NeedTwoBuffers ||

#ifdef COMPRESS_ON_WIRE
                (ThisScb->LazyWriteThread[0] == PsGetCurrentThread()) ||
                (ThisScb->LazyWriteThread[1] == PsGetCurrentThread()) ||
#else
                (NtfsGetTopLevelContext()->SavedTopLevelIrp == (PIRP)FSRTL_CACHE_TOP_LEVEL_IRP) ||

#endif
                (ThisScb->Fcb == NtfsReserved12Fcb))) {

                //
                //  If no one is waiting then see if we can continue waiting.
                //

                if (ExGetExclusiveWaiterCount( &NtfsReservedBufferResource ) == 0) {

                    //
                    //  If there is no one waiting behind us and there is no current
                    //  multi-buffer owner, then try again here.
                    //

                    if (NtfsReservedBufferThread == NULL) {

                        continue;
                    }

                    NtfsBufferAllocationFailure = FALSE;
                }

                ExReleaseResourceLite( &NtfsReservedBufferResource );

                NtfsRaiseStatus( IrpContext, STATUS_FILE_LOCK_CONFLICT, NULL, NULL );
            }
        }
    }

    //
    //  Use a try-finally to facilitate cleanup.
    //

    try {

        if (ARGUMENT_PRESENT(Mdl)) {

            //
            //  Allocate an Mdl for this buffer.
            //

            TempMdl = IoAllocateMdl( TempBuffer,
                                     *Length,
                                     FALSE,
                                     FALSE,
                                     NULL );

            if (TempMdl == NULL) {

                NtfsRaiseStatus( IrpContext, STATUS_INSUFFICIENT_RESOURCES, NULL, NULL );
            }

            //
            //  Lock the new Mdl in memory.
            //

            MmBuildMdlForNonPagedPool( TempMdl );
            *Mdl = TempMdl;
        }

    } finally {

        DebugUnwind( NtfsCreateMdlAndBuffer );

        //
        //  If abnormal termination, back out anything we've done.
        //

        if (AbnormalTermination()) {

            NtfsDeleteMdlAndBuffer( TempMdl, TempBuffer );

        //
        //  Otherwise, give the Mdl and buffer to the caller.
        //

        } else {

            *Buffer = TempBuffer;
        }

        DebugTrace( -1, Dbg, ("NtfsCreateMdlAndBuffer:  Exit\n") );
    }

    return;
}


VOID
NtfsDeleteMdlAndBuffer (
    IN PMDL Mdl OPTIONAL,
    IN PVOID Buffer OPTIONAL
    )

/*++

Routine Description:

    This routine releases an Mdl and/or a buffer.  Either argument may be
    omitted (NULL), in which case that half of the cleanup is skipped.

    The buffer is first handed to NtfsFreeReservedBuffer.  Only when that
    routine returns FALSE is the buffer passed on to NtfsFreePool.

Arguments:

    Mdl - Address of the Mdl to free, or NULL if there is none.

    Buffer - Address of the buffer to free, or NULL if there is none.

Return Value:

    None.

--*/

{
    //
    //  Release the Mdl when the caller supplied one.
    //

    if (ARGUMENT_PRESENT( Mdl )) {

        IoFreeMdl( Mdl );
    }

    //
    //  Without a buffer there is nothing left to do.
    //

    if (!ARGUMENT_PRESENT( Buffer )) {

        return;
    }

    //
    //  Give the reserved buffer code first shot at the buffer.  If it
    //  accepted the buffer we are done.
    //

    if (NtfsFreeReservedBuffer( Buffer )) {

        return;
    }

    //
    //  The reserved buffer code declined it, so return it to pool.
    //

    NtfsFreePool( Buffer );
}


PMDL
NtfsBuildZeroMdl (
    IN PIRP_CONTEXT IrpContext,
    IN ULONG Length,
    OUT PVOID *Buffer
    )
/*++

Routine Description:

    Build an Mdl that describes a run of zeros without allocating a full
    zeroed buffer of that size.  A single PAGE_SIZE nonpaged buffer is
    allocated and zeroed, and every page frame entry in the Mdl is pointed at
    that one physical page.

    If an Mdl for the requested length cannot be allocated, the length is
    repeatedly halved (rounded down to a sector multiple, never below
    PAGE_SIZE) and the allocation retried.  The returned Mdl may therefore
    describe fewer bytes than requested; the caller should look at
    Mdl->ByteCount to see the true size.

Arguments:

    IrpContext - Supplies the Vcb whose BytesPerSector is used when the
        length is rounded during fallback.

    Length - The desired length of the zero buffer.  We may return less
        than this.

    Buffer - Receives the nonpaged pool page backing the Mdl.  The caller
        should free it after freeing the returned Mdl.  Set to NULL when
        this routine returns NULL.

Return Value:

    The Mdl if successful, NULL if either the page or the Mdl could not be
    allocated.

--*/

{
    PMDL ZeroMdl;
    PPFN_NUMBER PfnArray;
    ULONG RequestedByteCount;
    ULONG PageCount;
    ULONG PageIndex;

    //
    //  Get the one page of zeros everything else is built on.
    //

    *Buffer = (PCHAR) NtfsAllocatePoolNoRaise( NonPagedPool, PAGE_SIZE );

    if (*Buffer == NULL) {

        return NULL;
    }

    RtlZeroMemory( *Buffer, PAGE_SIZE );

    //
    //  Spin down trying to get an Mdl which can describe our operation.
    //  We give up once an attempt at PAGE_SIZE or less has failed.
    //

    for (;;) {

        ZeroMdl = IoAllocateMdl( *Buffer, Length, FALSE, FALSE, NULL );

        if ((ZeroMdl != NULL) || (Length <= PAGE_SIZE)) {

            break;
        }

        //
        //  Fall back by half, rounded down to a sector multiple, but never
        //  below what we physically allocated.
        //

        Length = BlockAlignTruncate( Length / 2, (LONG)IrpContext->Vcb->BytesPerSector );

        if (Length < PAGE_SIZE) {

            Length = PAGE_SIZE;
        }
    }

    //
    //  No Mdl at any size - back out the page and fail.
    //

    if (ZeroMdl == NULL) {

        NtfsFreePool( *Buffer );
        *Buffer = NULL;

        return NULL;
    }

    //
    //  If we have throttled all the way down to exactly one page, a plain
    //  nonpaged pool Mdl describes the allocation as is.
    //

    if (Length == PAGE_SIZE) {

        MmBuildMdlForNonPagedPool( ZeroMdl );

        return ZeroMdl;
    }

    //
    //  Otherwise build the Mdl as though it covered only the page we
    //  allocated, then restore the real byte count and replicate the page
    //  frame number through the rest of the array so that every page of
    //  the transfer maps the same page of zeros.
    //
    //  Mm offers no interface for this, so the Mdl is pulled apart and
    //  rebuilt by hand.
    //

    RequestedByteCount = ZeroMdl->ByteCount;

    ZeroMdl->ByteCount = PAGE_SIZE;
    MmBuildMdlForNonPagedPool( ZeroMdl );

    ZeroMdl->MdlFlags &= ~MDL_SOURCE_IS_NONPAGED_POOL;
    ZeroMdl->MdlFlags |= MDL_PAGES_LOCKED;
    ZeroMdl->MappedSystemVa = NULL;
    ZeroMdl->ByteCount = RequestedByteCount;

    //
    //  Entry zero was filled in by MmBuildMdlForNonPagedPool; copy it into
    //  every remaining slot.
    //

    PfnArray = MmGetMdlPfnArray( ZeroMdl );
    PageCount = ADDRESS_AND_SIZE_TO_SPAN_PAGES( 0, RequestedByteCount );

    for (PageIndex = 1; PageIndex < PageCount; PageIndex += 1) {

        PfnArray[PageIndex] = PfnArray[0];
    }

    return ZeroMdl;
}


VOID
NtfsWriteClusters (
    IN PIRP_CONTEXT IrpContext,
    IN PVCB Vcb,
    IN PSCB Scb,
    IN VBO StartingVbo,
    IN PVOID Buffer,
    IN ULONG ClusterCount
    )

/*++

Routine Description:

    This routine is called to write clusters directly to a file.  It is
    needed when converting a resident attribute to non-resident when
    we can't initialize through the cache manager.  This happens when
    we receive a SetEndOfFile from MM when creating a section for
    a resident file.

    The write is issued synchronously through a private Irp and a stack
    based NTFS_IO_CONTEXT.  The transfer is done in one or more chunks,
    depending on how large an Mdl can be obtained.

    If Buffer is NULL the routine writes zeros instead, using the single
    page zero Mdl built by NtfsBuildZeroMdl.

Arguments:

    IrpContext - Context for the current request.  Its MajorFunction and
        Union.NtfsIoContext are swapped for the duration of the call and
        restored on exit.  IRP_CONTEXT_STATE_WAIT is set and left set, and
        IRP_CONTEXT_STATE_ALLOC_IO_CONTEXT is cleared for the duration of the
        call and set again on exit if it was set on entry.

    Vcb - Vcb for this device.

    Scb - Scb for the stream being written.  Passed through to
        NtfsNonCachedIo.

    StartingVbo - This is the starting offset to write to.

    Buffer - Buffer containing the data to write, or NULL to write zeros.

    ClusterCount - This is the number of clusters to write.

Return Value:

    None.  This routine will raise if the operation is unsuccessful.

--*/

{
    PIRP NewIrp = NULL;
    UCHAR MajorFunction = IrpContext->MajorFunction;
    BOOLEAN LockedUserBuffer = FALSE;
    PNTFS_IO_CONTEXT PreviousContext;
    ULONG State;
    ULONG ByteCount = BytesFromClusters( Vcb, ClusterCount );
    ULONG OriginalByteCount = ByteCount;
    PMDL Mdl = NULL;

    NTFS_IO_CONTEXT LocalContext;

    BOOLEAN ZeroBuffer = FALSE;

    PAGED_CODE();

    DebugTrace( +1, Dbg, ("NtfsWriteClusters:  Entered\n") );
    DebugTrace( 0, Dbg, ("StartingVbo   -> %016I64x\n", StartingVbo) );
    DebugTrace( 0, Dbg, ("Buffer        -> %08lx\n", Buffer) );
    DebugTrace( 0, Dbg, ("ClusterCount  -> %08lx\n", ClusterCount) );

    //
    //  Force this operation to be synchronous.
    //

    SetFlag( IrpContext->State, IRP_CONTEXT_STATE_WAIT );

    //
    //  Swap out the old Io context block.
    //

    PreviousContext = IrpContext->Union.NtfsIoContext;

    IrpContext->Union.NtfsIoContext = &LocalContext;
    State = IrpContext->State;
    ClearFlag( IrpContext->State, IRP_CONTEXT_STATE_ALLOC_IO_CONTEXT );

    //
    //  Use a try-finally so we can clean up properly.
    //

    try {

        PIO_STACK_LOCATION IrpSp;

        RtlZeroMemory( IrpContext->Union.NtfsIoContext, sizeof( NTFS_IO_CONTEXT ));
        KeInitializeEvent( &IrpContext->Union.NtfsIoContext->Wait.SyncEvent,
                           NotificationEvent,
                           FALSE );

        NewIrp = IoBuildAsynchronousFsdRequest( IRP_MJ_WRITE,
                                                Vcb->Vpb->DeviceObject,
                                                Buffer,
                                                ByteCount,
                                                (PLARGE_INTEGER)&StartingVbo,
                                                NULL );

        if (NewIrp == NULL) {

            NtfsRaiseStatus( IrpContext, STATUS_INSUFFICIENT_RESOURCES, NULL, NULL );
        }

        //
        //  We now have an Irp, we want to make it look as though it is part of
        //  the current call.  We need to adjust the Irp stack to update this.
        //

        IoSetNextIrpStackLocation( NewIrp );

        //
        //  Check if we're writing zeros
        //

        if (Buffer == NULL) {

            //
            //  This won't work for compression or encryption because they manipulate
            //  the input buffer
            //

            ASSERT( !FlagOn( Scb->AttributeFlags, ATTRIBUTE_FLAG_COMPRESSION_MASK | ATTRIBUTE_FLAG_ENCRYPTED ) );

            //
            //  On success Buffer now points at the zero page and Mdl describes
            //  (possibly fewer than) ByteCount bytes of zeros.  Both are owned
            //  by this routine and released in the finally clause.
            //

            Mdl = NtfsBuildZeroMdl( IrpContext, ByteCount, &Buffer );
            if (!Mdl) {
                NtfsRaiseStatus( IrpContext, STATUS_INSUFFICIENT_RESOURCES, NULL, NULL );
            }
            ZeroBuffer = TRUE;
        }

        //
        //  Loop and do the write in chunks
        //

        while (ByteCount != 0) {

            ULONG Size = ByteCount;

            if (!ZeroBuffer) {

                //
                //  Attempt to allocate a mdl - reducing the size if we fail until
                //  we're at a page size
                //

                do {

                    Mdl = IoAllocateMdl( Add2Ptr( Buffer, OriginalByteCount - ByteCount ), Size, FALSE, FALSE, NULL );

                    if (Mdl == NULL) {
                        Size = BlockAlignTruncate( Size / 2, (LONG)Vcb->BytesPerSector );
                    }
                } while ((Mdl == NULL) && (Size >= PAGE_SIZE));

                if (!Mdl) {
                    NtfsRaiseStatus( IrpContext, STATUS_INSUFFICIENT_RESOURCES, NULL, NULL );
                }
                //
                //  Now probe the buffer described by the Irp.  If we get an exception,
                //  deallocate the Mdl and return the appropriate "expected" status.
                //

                try {

                    MmProbeAndLockPages( Mdl, NewIrp->RequestorMode, IoReadAccess );

                } except(EXCEPTION_EXECUTE_HANDLER) {

                    NTSTATUS Status;

                    Status = GetExceptionCode();

                    IoFreeMdl( Mdl );
                    NtfsRaiseStatus( IrpContext,
                                     FsRtlIsNtstatusExpected(Status) ? Status : STATUS_INVALID_USER_BUFFER,
                                     NULL,
                                     NULL );
                }

                LockedUserBuffer = TRUE;
            } else {

                //
                //  The zero Mdl is reused for every chunk, so each chunk is
                //  limited to what that Mdl describes.
                //

                Size = min( ByteCount, Mdl->ByteCount );
            }

            //
            //  Put our buffer/Mdl into the Irp and update the offset and length
            //

            if (!ZeroBuffer) {
                NewIrp->UserBuffer = Add2Ptr( Buffer, OriginalByteCount - ByteCount );
            }
            NewIrp->MdlAddress = Mdl;

            IrpSp = IoGetCurrentIrpStackLocation( NewIrp );
            IrpSp->DeviceObject = Vcb->Vpb->DeviceObject;
            IrpSp->Parameters.Write.Length = Size;
            IrpSp->Parameters.Write.ByteOffset.QuadPart = StartingVbo;

            //
            //  Put the write code into the IrpContext.
            //

            IrpContext->MajorFunction = IRP_MJ_WRITE;

            //
            //  Write the data to the disk.
            //

            NtfsNonCachedIo( IrpContext,
                             NewIrp,
                             Scb,
                             StartingVbo,
                             Size,
                             0 );

            //
            //  If we encountered an error or didn't write all the bytes, then
            //  raise the error code.  We use the IoStatus in the Irp instead of
            //  our structure since this Irp will not be completed.
            //

            if (!NT_SUCCESS( NewIrp->IoStatus.Status )) {

                DebugTrace( 0, Dbg, ("Couldn't write clusters to disk -> %08lx\n", NewIrp->IoStatus.Status) );

                NtfsRaiseStatus( IrpContext, NewIrp->IoStatus.Status, NULL, NULL );

            } else if (NewIrp->IoStatus.Information != Size) {

                DebugTrace( 0, Dbg, ("Couldn't write all byes to disk\n") );
                NtfsRaiseStatus( IrpContext, STATUS_UNEXPECTED_IO_ERROR, NULL, NULL );
            }

            //
            //  Cleanup the MDL.  A per-chunk Mdl over the caller's buffer is
            //  unlocked and freed here.  The zero Mdl is only detached from
            //  the Irp; it is reused by the next chunk and freed on exit.
            //

            if (LockedUserBuffer) {
                MmUnlockPages( NewIrp->MdlAddress );
                LockedUserBuffer = FALSE;
                IoFreeMdl( NewIrp->MdlAddress );
            }
            NewIrp->MdlAddress = NULL;

            //
            //  Adjust offset and length
            //

            ByteCount -= Size;
            StartingVbo += Size;
        }

    } finally {

        DebugUnwind( NtfsWriteClusters );

        //
        //  Recover the Io Context and remember if it is from pool.
        //

        IrpContext->Union.NtfsIoContext = PreviousContext;

        SetFlag( IrpContext->State, FlagOn( State, IRP_CONTEXT_STATE_ALLOC_IO_CONTEXT ) );

        IrpContext->MajorFunction = MajorFunction;

        //
        //  If we allocated an Irp, we need to deallocate it.  We also
        //  have to return the correct function code to the Irp Context.
        //

        if (NewIrp != NULL) {

            //
            //  If there is an Mdl we free that first.
            //

            if (NewIrp->MdlAddress != NULL) {

                if (LockedUserBuffer) {

                    MmUnlockPages( NewIrp->MdlAddress );
                }

                IoFreeMdl( NewIrp->MdlAddress );

            //
            //  Otherwise, if we were writing zeros, the zero Mdl was detached
            //  from the Irp at the end of the last chunk (or was never
            //  attached).  Nothing else owns it, so free it here or it leaks
            //  on every successful zeroing write.  Mdl is only trusted in the
            //  zero case; in the user buffer case it may already have been
            //  freed above.
            //

            } else if (ZeroBuffer && (Mdl != NULL)) {

                IoFreeMdl( Mdl );
            }

            IoFreeIrp( NewIrp );
        }

        //
        //  Free the zero page after the Mdl which described it.
        //

        if (ZeroBuffer && Buffer) {
            NtfsFreePool( Buffer );
        }

        DebugTrace( -1, Dbg, ("NtfsWriteClusters:  Exit\n") );
    }

    return;
}


//
//  Local support routine
//

VOID
NtfsMultipleAsync (
    IN PIRP_CONTEXT IrpContext,
    IN PDEVICE_OBJECT DeviceObject,
    IN PIRP MasterIrp,
    IN ULONG MultipleIrpCount,
    IN PIO_RUN IoRuns,
    IN UCHAR IrpSpFlags
    )

/*++

Routine Description:

    This routine first does the initial setup required of a Master IRP that is
    going to be completed using associated IRPs.  This routine should not
    be used if only one async request is needed, instead the single read/write
    async routines should be called.

    The NTFS_IO_CONTEXT found in IrpContext->Union.NtfsIoContext serves as the
    communications area between here and the common completion routines.  This
    routine stores the master Irp and the run count in it.

    NOTE(review): earlier versions of this comment described a spinlock
    allocated here and released in NtfsWaitSync.  No spinlock is allocated in
    this routine as written.

    Next this routine reads or writes one or more contiguous sectors from
    a device asynchronously, and is used if there are multiple reads for a
    master IRP.  A completion routine is used to synchronize with the
    completion of all of the I/O requests started by calls to this routine.
    The completion routine is chosen by IRP_CONTEXT_STATE_WAIT: the sync
    variant when the caller will wait, the async variant otherwise.

    All of the associated Irps and partial Mdls are built before any of them
    is sent, so that an allocation failure can be unwound without any I/O
    having been started.

    Also, prior to calling this routine the caller must initialize the
    IoStatus field in the Context, with the correct success status and byte
    count which are expected if all of the parallel transfers complete
    successfully.  After return this status will be unchanged if all requests
    were, in fact, successful.  However, if one or more errors occur, the
    IoStatus will be modified to reflect the error status and byte count
    from the first run (by Vbo) which encountered an error.  I/O status
    from all subsequent runs will not be indicated.

Arguments:

    IrpContext - Context for the request.  IrpContext->MajorFunction supplies
        either IRP_MJ_READ or IRP_MJ_WRITE.

    DeviceObject - Supplies the device to be read

    MasterIrp - Supplies the master Irp.

    MultipleIrpCount - Supplies the number of multiple async requests
        that will be issued against the master irp.

    IoRuns - Supplies an array containing the Vbo, Lbo, BufferOffset, and
        ByteCount for all the runs to executed in parallel.  The SavedIrp
        field of each entry is overwritten with the associated Irp.

    IrpSpFlags - Flags to set in the irp stack location for the i/o - i.e write through


Return Value:

    None.  Raises STATUS_INSUFFICIENT_RESOURCES if an associated Irp or
    Mdl cannot be allocated.

--*/

{
    PIRP Irp;
    PIO_STACK_LOCATION IrpSp;
    PMDL Mdl;
    PNTFS_IO_CONTEXT Context;
    ULONG TotalByteCount = 0;

    ULONG UnwindRunCount = 0;

    BOOLEAN Wait;

    PAGED_CODE();

    DebugTrace( +1, Dbg, ("NtfsMultipleAsync\n") );
    DebugTrace( 0, Dbg, ("MajorFunction    = %08lx\n", IrpContext->MajorFunction) );
    DebugTrace( 0, Dbg, ("DeviceObject     = %08lx\n", DeviceObject) );
    DebugTrace( 0, Dbg, ("MasterIrp        = %08lx\n", MasterIrp) );
    DebugTrace( 0, Dbg, ("MultipleIrpCount = %08lx\n", MultipleIrpCount) );
    DebugTrace( 0, Dbg, ("IoRuns           = %08lx\n", IoRuns) );

    //
    //  Set up things according to whether this is truely async.
    //

    Wait = (BOOLEAN) FlagOn( IrpContext->State, IRP_CONTEXT_STATE_WAIT );

    Context = IrpContext->Union.NtfsIoContext;

    try {

        //
        //  Initialize Context, for use in Read/Write Multiple Asynch.
        //

        Context->MasterIrp = MasterIrp;

        //
        //  Iterate through the runs, doing everything that can fail
        //

        for ( UnwindRunCount = 0;
              UnwindRunCount < MultipleIrpCount;
              UnwindRunCount++ ) {

            //
            //  Create an associated IRP, making sure there is one stack entry for
            //  us, as well.  SavedIrp is cleared first so the unwind code can
            //  tell whether this run got as far as owning an Irp.
            //

            IoRuns[UnwindRunCount].SavedIrp = NULL;

            Irp = IoMakeAssociatedIrp( MasterIrp, (CCHAR)(DeviceObject->StackSize + 1) );

            if (Irp == NULL) {

                NtfsRaiseStatus( IrpContext, STATUS_INSUFFICIENT_RESOURCES, NULL, NULL );
            }

            IoRuns[UnwindRunCount].SavedIrp = Irp;

            //
            //  Allocate and build a partial Mdl for the request.
            //

            Mdl = IoAllocateMdl( (PCHAR)MasterIrp->UserBuffer +
                                 IoRuns[UnwindRunCount].BufferOffset,
                                 IoRuns[UnwindRunCount].ByteCount,
                                 FALSE,
                                 FALSE,
                                 Irp );

            if (Mdl == NULL) {

                NtfsRaiseStatus( IrpContext, STATUS_INSUFFICIENT_RESOURCES, NULL, NULL );
            }

            //
            //  Sanity Check
            //

            ASSERT( Mdl == Irp->MdlAddress );

            IoBuildPartialMdl( MasterIrp->MdlAddress,
                               Mdl,
                               (PCHAR)MasterIrp->UserBuffer +
                               IoRuns[UnwindRunCount].BufferOffset,
                               IoRuns[UnwindRunCount].ByteCount );

            //
            //  Get the first IRP stack location in the associated Irp
            //

            IoSetNextIrpStackLocation( Irp );
            IrpSp = IoGetCurrentIrpStackLocation( Irp );

            //
            //  Setup the Stack location to describe our read.
            //

            IrpSp->MajorFunction = IrpContext->MajorFunction;
            IrpSp->Parameters.Read.Length = IoRuns[UnwindRunCount].ByteCount;
            IrpSp->Parameters.Read.ByteOffset.QuadPart = IoRuns[UnwindRunCount].StartingVbo;

            //
            //  If this Irp is the result of a WriteThough operation,
            //  record that in our own stack location.
            //

            if (FlagOn(IrpContext->State, IRP_CONTEXT_STATE_WRITE_THROUGH)) {

                SetFlag( IrpSp->Flags, SL_WRITE_THROUGH );
            }

            //
            //  Set up the completion routine address in our stack frame.
            //

            IoSetCompletionRoutine( Irp,
                                    (Wait
                                     ? &NtfsMultiSyncCompletionRoutine
                                     : &NtfsMultiAsyncCompletionRoutine),
                                    Context,
                                    TRUE,
                                    TRUE,
                                    TRUE );

            //
            //  Setup the next IRP stack location in the associated Irp for the disk
            //  driver beneath us.
            //

            IrpSp = IoGetNextIrpStackLocation( Irp );

            //
            //  Setup the Stack location to do a read from the disk driver.
            //

            IrpSp->MajorFunction = IrpContext->MajorFunction;
            IrpSp->Flags = IrpSpFlags;
            IrpSp->Parameters.Read.Length = IoRuns[UnwindRunCount].ByteCount;
            IrpSp->Parameters.Read.ByteOffset.QuadPart = IoRuns[UnwindRunCount].StartingLbo;
            TotalByteCount += IoRuns[UnwindRunCount].ByteCount;

            //
            //  Tell the device to write it through as well.  The driver
            //  beneath us only sees this stack location, so the flag must
            //  be set here (after Flags has been initialized) in order to
            //  have any effect, exactly as NtfsSingleAsync does.
            //

            if (FlagOn( IrpContext->State, IRP_CONTEXT_STATE_WRITE_THROUGH )) {

                SetFlag( IrpSp->Flags, SL_WRITE_THROUGH );
            }
        }

        //
        //  We only need to set the associated IRP count in the master irp to
        //  make it a master IRP.  In the synchronous case we set the count to
        //  one more than our caller requested, because we do not want the I/O
        //  system to complete the I/O.  We also set our own count.
        //

        Context->IrpCount = MultipleIrpCount;
        MasterIrp->AssociatedIrp.IrpCount = MultipleIrpCount;
        IrpSp = IoGetCurrentIrpStackLocation( MasterIrp );
        IrpSp->Parameters.Read.Length = TotalByteCount;


        if (Wait) {

            MasterIrp->AssociatedIrp.IrpCount += 1;
        } else {

            //
            //  Convert the resource ownership to async before we do the i/o if
            //  we haven't already.  The low two bits of ResourceThreadId being
            //  set marks the owner as already converted.
            //

            if (IrpContext->Union.NtfsIoContext->Wait.Async.Resource &&
                !FlagOn( IrpContext->Union.NtfsIoContext->Wait.Async.ResourceThreadId, 3 )) {

                ASSERT( NtfsIsSharedResource( IrpContext->Union.NtfsIoContext->Wait.Async.Resource ) == 1 );

                IrpContext->Union.NtfsIoContext->Wait.Async.ResourceThreadId = (ERESOURCE_THREAD)MasterIrp | 3;
                ExSetResourceOwnerPointer( IrpContext->Union.NtfsIoContext->Wait.Async.Resource, (PVOID)IrpContext->Union.NtfsIoContext->Wait.Async.ResourceThreadId );
            }
        }

        //
        //  Now that all the dangerous work is done, issue the Io requests
        //

        for (UnwindRunCount = 0;
             UnwindRunCount < MultipleIrpCount;
             UnwindRunCount++) {

            Irp = IoRuns[UnwindRunCount].SavedIrp;

            //
            //  If IoCallDriver returns an error, it has completed the Irp
            //  and the error will be caught by our completion routines
            //  and dealt with as a normal IO error.
            //

            (VOID)IoCallDriver( DeviceObject, Irp );
        }

    } finally {

        ULONG i;

        DebugUnwind( NtfsMultipleAsync );

        //
        //  Only making the associated Irps and allocating the Mdls
        //  are expected to fail.
        //

        if (AbnormalTermination()) {

            //
            //  Unwind every run up to and including the one being built when
            //  the failure occurred.  UnwindRunCount equals MultipleIrpCount
            //  once the setup loop has finished, so the index is also bounded
            //  by MultipleIrpCount to stay inside the IoRuns array.
            //

            for (i = 0; (i < MultipleIrpCount) && (i <= UnwindRunCount); i++) {

                if ((Irp = IoRuns[i].SavedIrp) != NULL) {

                    if (Irp->MdlAddress != NULL) {

                        IoFreeMdl( Irp->MdlAddress );
                    }

                    IoFreeIrp( Irp );
                }
            }
        }

        //
        //  And return to our caller
        //

        DebugTrace( -1, Dbg, ("NtfsMultipleAsync -> VOID\n") );
    }

    return;
}


//
//  Local support routine
//

VOID
NtfsSingleAsync (
    IN PIRP_CONTEXT IrpContext,
    IN PDEVICE_OBJECT DeviceObject,
    IN LBO Lbo,
    IN ULONG ByteCount,
    IN PIRP Irp,
    IN UCHAR MajorFunction,
    IN UCHAR IrpSpFlags
    )

/*++

Routine Description:

    This routine transfers one run of contiguous bytes to or from a device
    and is used when a single transfer satisfies the whole request.  No
    associated Irp is built: the next stack location of the supplied Irp
    is filled in and the Irp itself is passed down, so the transfer uses
    the buffer already described by that Irp.

    The completion routine is picked by IRP_CONTEXT_STATE_WAIT: the sync
    variant when the flag is set, the async variant otherwise.  In the
    async case the resource recorded in the io context is switched to an
    Irp based owner pointer before the request is sent, unless that has
    already been done.

Arguments:

    IrpContext - Context for the request.  Supplies the State flags and
        the NTFS_IO_CONTEXT handed to the completion routine.

    DeviceObject - Supplies the device to send the request to

    Lbo - Supplies the starting Logical Byte Offset of the transfer

    ByteCount - Supplies the number of bytes to transfer

    Irp - Supplies the Irp to pass down to the device.

    MajorFunction - IRP_MJ_READ || IRP_MJ_WRITE.  This value, not
        IrpContext->MajorFunction, is what is placed in the stack location.

    IrpSpFlags - flags to set in the irp stack location for the i/o like write through

Return Value:

    None.

--*/

{
    PIO_STACK_LOCATION NextIrpSp;
    PNTFS_IO_CONTEXT IoContext = IrpContext->Union.NtfsIoContext;
    BOOLEAN Synchronous = (FlagOn( IrpContext->State, IRP_CONTEXT_STATE_WAIT ) != 0);

    PAGED_CODE();

    DebugTrace( +1, Dbg, ("NtfsSingleAsync\n") );
    DebugTrace( 0, Dbg, ("MajorFunction = %08lx\n", IrpContext->MajorFunction) );
    DebugTrace( 0, Dbg, ("DeviceObject  = %08lx\n", DeviceObject) );
    DebugTrace( 0, Dbg, ("Lbo           = %016I64x\n", Lbo) );
    DebugTrace( 0, Dbg, ("ByteCount     = %08lx\n", ByteCount) );
    DebugTrace( 0, Dbg, ("Irp           = %08lx\n", Irp) );

    //
    //  Hook up the completion routine matching the way the caller is
    //  going to consume the result.
    //

    IoSetCompletionRoutine( Irp,
                            (Synchronous
                             ? &NtfsSingleSyncCompletionRoutine
                             : &NtfsSingleAsyncCompletionRoutine),
                            IoContext,
                            TRUE,
                            TRUE,
                            TRUE );

    //
    //  Describe the transfer in the stack location the driver beneath
    //  us will see.
    //

    NextIrpSp = IoGetNextIrpStackLocation( Irp );

    NextIrpSp->MajorFunction = MajorFunction;
    NextIrpSp->Flags = IrpSpFlags;
    NextIrpSp->Parameters.Read.ByteOffset.QuadPart = Lbo;
    NextIrpSp->Parameters.Read.Length = ByteCount;

    //
    //  A write through request is passed on to the device in addition
    //  to whatever flags the caller asked for.
    //

    if (FlagOn( IrpContext->State, IRP_CONTEXT_STATE_WRITE_THROUGH )) {

        SetFlag( NextIrpSp->Flags, SL_WRITE_THROUGH );
    }

    //
    //  For a truly async request, hand ownership of the resource over to
    //  the Irp before the i/o starts.  The low two bits of the thread id
    //  being set means the conversion has already happened.
    //

    if (!Synchronous) {

        if ((IoContext->Wait.Async.Resource != NULL) &&
            !FlagOn( IoContext->Wait.Async.ResourceThreadId, 3 )) {

            ASSERT( NtfsIsSharedResource( IoContext->Wait.Async.Resource ) == 1 );

            IoContext->Wait.Async.ResourceThreadId = (ERESOURCE_THREAD)Irp | 3;

            ExSetResourceOwnerPointer( IoContext->Wait.Async.Resource,
                                       (PVOID)IoContext->Wait.Async.ResourceThreadId );
        }
    }

    //
    //  Send the request down.  Should IoCallDriver return an error, the
    //  Irp has been completed and our completion routine has already seen
    //  the failure, so the return value is deliberately ignored.
    //

    (VOID)IoCallDriver( DeviceObject, Irp );

    DebugTrace( -1, Dbg, ("NtfsSingleAsync -> VOID\n") );

    return;
}


//
//  Local support routine
//

VOID
NtfsWaitSync (
    IN PIRP_CONTEXT IrpContext
    )

/*++

Routine Description:

    This routine waits for one or more previously started I/O requests
    from the above routines.  It blocks on the SyncEvent in the io context
    attached to the IrpContext and, once the wait is satisfied, sets that
    event back to the not-signaled state.

Arguments:

    IrpContext - Supplies the context whose Union.NtfsIoContext holds the
        event to wait on.

Return Value:

    None

--*/

{
    PKEVENT SyncEvent;

    PAGED_CODE();

    DebugTrace( +1, Dbg, ("NtfsWaitSync:  Entered\n") );

    SyncEvent = &IrpContext->Union.NtfsIoContext->Wait.SyncEvent;

    //
    //  Non-alertable kernel mode wait with no timeout.
    //

    KeWaitForSingleObject( SyncEvent,
                           Executive,
                           KernelMode,
                           FALSE,
                           NULL );

    KeClearEvent( SyncEvent );

    DebugTrace( -1, Dbg, ("NtfsWaitSync -> VOID\n") );
}


//
//  Local support routine.
//

NTSTATUS
NtfsMultiAsyncCompletionRoutine (
    IN PDEVICE_OBJECT DeviceObject,
    IN PIRP Irp,
    IN PVOID Contxt
    )

/*++

Routine Description:

    This is the completion routine for all asynchronous reads and writes
    started via NtfsMultipleAsynch.  It is called once for every associated
    Irp and synchronizes with the completion of the other associated Irps
    through an interlocked decrement of the IrpCount in the Context.

    The completion routine has the following responsibilities:

        If the individual request was completed with an error, the IoStatus
        of the associated Irp is copied into the master Irp.

        If the IrpCount goes to 0 this was the last associated Irp.  The
        master Irp is marked pending and then one of the following happens:

            On success the end of the buffer is zeroed if necessary
            (NtfsZeroEndOfBuffer), the requested byte count is stored as
            the Information of the master Irp and, for non paging io, the
            file object is flagged as read or modified.

            On an error for a non paging io request the entire request is
            posted (see the hot fix comment in the body).

            On an error for a paging io request nothing more is done and
            the master Irp keeps the error status.

        Afterwards the resource captured from the Context, if any, is
        released and the Context is freed unless the request was posted.

Arguments:

    DeviceObject - Pointer to the file system device object.  Not used.

    Irp - Pointer to the associated Irp which is being completed.  (This
          Irp will no longer be accessible after this routine returns.)

    Contxt - The context parameter which was specified for all of
             the multiple asynch I/O requests for this MasterIrp.  It is
             a PNTFS_IO_CONTEXT.

Return Value:

    STATUS_SUCCESS - unless the request was posted.

    STATUS_MORE_PROCESSING_REQUIRED - if the request was posted.  In that
        case the associated Irp and its Mdl have been freed here.

--*/

{

    PNTFS_IO_CONTEXT Context = Contxt;
    PIRP MasterIrp = Context->MasterIrp;
    PIO_STACK_LOCATION IrpSp = IoGetCurrentIrpStackLocation( Irp );
    BOOLEAN CompleteRequest = TRUE;

    UNREFERENCED_PARAMETER( DeviceObject );

    DebugTrace( +1, Dbg, ("NtfsMultiAsyncCompletionRoutine, Context = %08lx\n", Context) );

    //
    //  If we got an error (or verify required), remember it in the master Irp.
    //
    //  NOTE(review): MasterIrp was already loaded from the Context in its
    //  initializer above, so this reload looks redundant.
    //

    MasterIrp = Context->MasterIrp;

    if (!NT_SUCCESS( Irp->IoStatus.Status )) {

        MasterIrp->IoStatus = Irp->IoStatus;

        //
        //  Track any lower drivers that fail a paging io read with
        //  insufficient resources.
        //

        if ((Irp->IoStatus.Status == STATUS_INSUFFICIENT_RESOURCES) &&
            FlagOn( Context->Flags, NTFS_IO_CONTEXT_PAGING_IO ) &&
            (IrpSp->MajorFunction == IRP_MJ_READ)) {

            NtfsFailedHandedOffPagingReads += 1;
        }
    }

    //
    //  Decrement IrpCount and see if it goes to zero.  Only the completion
    //  of the last associated Irp does the work below.
    //

    if (InterlockedDecrement( &Context->IrpCount ) == 0) {

        PERESOURCE Resource;
        ERESOURCE_THREAD ResourceThreadId;

        //
        //  Capture the resource values out of the context to prevent
        //  colliding with the Fsp thread if we post this.  The Context must
        //  not be relied on for these values once the request is posted.
        //

        Resource = Context->Wait.Async.Resource;
        ResourceThreadId = Context->Wait.Async.ResourceThreadId;

        //
        //  Mark the master Irp pending
        //

        IoMarkIrpPending( MasterIrp );

        //
        //  If this request was successful then finish up the master Irp.
        //  FT_SUCCESS also treats the two FT recovery status codes as
        //  failures so that they take the hot fix path below.
        //

        if (FT_SUCCESS( MasterIrp->IoStatus.Status )) {

            //
            //  Do any necc. zeroing for read requests - if it fails then just complete
            //  the irp ZeroEndOfBuffer will put the error into the irp iostatus
            //

            if (NtfsZeroEndOfBuffer( MasterIrp, Context )) {
                MasterIrp->IoStatus.Information =
                    Context->Wait.Async.RequestedByteCount;

                //
                //  Go ahead an mark the File object to indicate that we performed
                //  either a read or write if this is not a paging io operation.
                //

                if (!FlagOn( Context->Flags, NTFS_IO_CONTEXT_PAGING_IO ) &&
                    (IrpSp->FileObject != NULL)) {

                    if (IrpSp->MajorFunction == IRP_MJ_READ) {

                        SetFlag( IrpSp->FileObject->Flags, FO_FILE_FAST_IO_READ );

                    } else {

                        SetFlag( IrpSp->FileObject->Flags, FO_FILE_MODIFIED );
                    }
                }
            }

        //
        //  If we had an error and will hot fix, we simply post the entire
        //  request.  Paging io errors are not posted; they fall through and
        //  the master Irp completes with the error.
        //

        } else if (!FlagOn( Context->Flags, NTFS_IO_CONTEXT_PAGING_IO )) {

            PIRP_CONTEXT IrpContext = NULL;

            //
            //  We need an IrpContext and then have to post the request.
            //  Use a try_except in case we fail the request for an IrpContext.
            //

            CompleteRequest = FALSE;
            try {

                //
                //  Hand the io context over to the new IrpContext before
                //  posting.
                //

                NtfsInitializeIrpContext( MasterIrp, TRUE, &IrpContext );
                IrpContext->Union.NtfsIoContext = Context;
                SetFlag( IrpContext->State, IRP_CONTEXT_STATE_ALLOC_IO_CONTEXT );

                NtfsPostRequest( IrpContext, MasterIrp );

            } except( EXCEPTION_EXECUTE_HANDLER ) {

                //
                //  Just give up.  The request is completed with the error
                //  already stored in the master Irp.
                //

                CompleteRequest = TRUE;

                if (IrpContext) {

                    //
                    //  We cleanup the context below, so detach it from the
                    //  IrpContext before cleaning that up.
                    //

                    IrpContext->Union.NtfsIoContext = NULL;
                    NtfsCleanupIrpContext( IrpContext, TRUE );
                }
            }
        }

        //
        //  Now release the resource, using the values captured above.
        //

        if (Resource != NULL) {

            ExReleaseResourceForThreadLite( Resource,
                                        ResourceThreadId );
        }

        if (CompleteRequest) {

            //
            //  and finally, free the context record.  This is skipped when
            //  the request was posted since the context went along with it.
            //

            ExFreeToNPagedLookasideList( &NtfsIoContextLookasideList, Context );
        }
    }

    DebugTrace( -1, Dbg, ("NtfsMultiAsyncCompletionRoutine\n") );

    //
    //  Return more processing required if we don't want the Irp to go away.
    //

    if (CompleteRequest) {

        return STATUS_SUCCESS;

    } else {

        //
        //  The request was posted.  We need to cleanup the associated Irp
        //  and its Mdl ourselves since we stop its completion processing.
        //

        IoFreeMdl( Irp->MdlAddress );
        IoFreeIrp( Irp );

        return STATUS_MORE_PROCESSING_REQUIRED;
    }
}


//
//  Local support routine.
//

NTSTATUS
NtfsMultiSyncCompletionRoutine (
    IN PDEVICE_OBJECT DeviceObject,
    IN PIRP Irp,
    IN PVOID Contxt
    )

/*++

Routine Description:

    Completion routine for the associated Irps of a synchronous multi-run
    read or write started via NtfsMultipleAsynch.  It is called once per
    associated Irp; the calls coordinate with each other through an
    interlocked decrement of the IrpCount kept in the io context.

    For every associated Irp the routine does the following:

        If the associated Irp failed, copy its IoStatus into the master
        Irp, and count paging io reads which a lower driver failed with
        STATUS_INSUFFICIENT_RESOURCES.

        Free the Mdl of the associated Irp and the associated Irp itself.

        If the IrpCount goes to 0, set the SyncEvent in the io context so
        that the thread waiting for the transfer is released.

Arguments:

    DeviceObject - Pointer to the file system device object.  Not used.

    Irp - Pointer to the associated Irp which is being completed.  It is
          freed by this routine.

    Contxt - The PNTFS_IO_CONTEXT which was specified for all of the
             associated Irps of this master Irp.

Return Value:

    Always STATUS_MORE_PROCESSING_REQUIRED.  The associated Irp has been
    freed here, so completion processing for it must not continue.

--*/

{
    PNTFS_IO_CONTEXT IoContext = Contxt;

    UNREFERENCED_PARAMETER( DeviceObject );

    DebugTrace( +1, Dbg, ("NtfsMultiSyncCompletionRoutine, Context = %08lx\n", IoContext) );

    if (!NT_SUCCESS( Irp->IoStatus.Status )) {

        //
        //  Remember the error (or verify required) in the master Irp.
        //

        IoContext->MasterIrp->IoStatus = Irp->IoStatus;

        //
        //  Keep a count of paging io reads that a lower driver failed for
        //  lack of resources.
        //

        if (FlagOn( IoContext->Flags, NTFS_IO_CONTEXT_PAGING_IO ) &&
            (Irp->IoStatus.Status == STATUS_INSUFFICIENT_RESOURCES) &&
            (IoGetCurrentIrpStackLocation( Irp )->MajorFunction == IRP_MJ_READ)) {

            NtfsFailedHandedOffPagingReads += 1;
        }
    }

    //
    //  IoCompleteRequest will not get a chance to clean up this associated
    //  Irp because we stop its completion below, so free it and its Mdl now.
    //  The Irp must not be touched past this point.
    //

    IoFreeMdl( Irp->MdlAddress );
    IoFreeIrp( Irp );

    //
    //  The last associated Irp to complete wakes up the waiter.
    //

    if (InterlockedDecrement( &IoContext->IrpCount ) == 0) {

        KeSetEvent( &IoContext->Wait.SyncEvent, 0, FALSE );
    }

    DebugTrace( -1, Dbg, ("NtfsMultiSyncCompletionRoutine -> STATUS_MORE_PROCESSING_REQUIRED\n") );

    return STATUS_MORE_PROCESSING_REQUIRED;
}


//
//  Local support routine.
//

NTSTATUS
NtfsSingleAsyncCompletionRoutine (
    IN PDEVICE_OBJECT DeviceObject,
    IN PIRP Irp,
    IN PVOID Contxt
    )

/*++

Routine Description:

    This is the completion routine for all asynchronous reads and writes
    started via NtfsSingleAsynch.

    The completion routine has the following responsibilities:

        Mark the Irp pending.

        On success zero the end of the buffer if necessary
        (NtfsZeroEndOfBuffer), store the requested byte count as the
        Information of the Irp and, for non paging io, flag the file
        object as read or modified.

        On an error for a non paging io request post the entire request
        (see the hot fix comment in the body).

        On an error for a paging io request just count reads which failed
        with STATUS_INSUFFICIENT_RESOURCES; the Irp completes with the
        error.

        Release the resource captured from the Context, if any, and free
        the Context unless the request was posted.

Arguments:

    DeviceObject - Pointer to the file system device object.  Not used.

    Irp - Pointer to the Irp for this request.  (This Irp will no longer
    be accessible after this routine returns.)

    Contxt - The context parameter which was specified in the call to
             NtfsSingleAsynch.  It is a PNTFS_IO_CONTEXT.

Return Value:

    STATUS_SUCCESS - unless the request was posted.

    STATUS_MORE_PROCESSING_REQUIRED - if the request was posted.

--*/

{
    PNTFS_IO_CONTEXT Context = Contxt;
    PIO_STACK_LOCATION IrpSp = IoGetCurrentIrpStackLocation( Irp );
    BOOLEAN CompleteRequest = TRUE;

    PERESOURCE Resource;
    ERESOURCE_THREAD ResourceThreadId;

    UNREFERENCED_PARAMETER( DeviceObject );

    DebugTrace( +1, Dbg, ("NtfsSingleAsyncCompletionRoutine, Context = %08lx\n", Context) );

    //
    //  Capture the resource values out of the context to prevent
    //  colliding with the Fsp thread if we post this.  The Context must
    //  not be relied on for these values once the request is posted.
    //

    Resource = Context->Wait.Async.Resource;
    ResourceThreadId = Context->Wait.Async.ResourceThreadId;

    //
    //  Mark the Irp pending
    //

    IoMarkIrpPending( Irp );

    //
    //  Fill in the information field correctly if this worked.  FT_SUCCESS
    //  also treats the two FT recovery status codes as failures so that
    //  they take the hot fix path below.
    //

    if (FT_SUCCESS( Irp->IoStatus.Status )) {

        //
        //  Zero the difference between filesize and data read if necc. on reads
        //  if it fails just complete the irp - ZeroEndOfBuffer will put the error into the
        //  irp
        //

        if (NtfsZeroEndOfBuffer( Irp, Context )) {
            Irp->IoStatus.Information = Context->Wait.Async.RequestedByteCount;

            //
            //  Go ahead an mark the File object to indicate that we performed
            //  either a read or write if this is not a paging io operation.
            //

            if (!FlagOn( Context->Flags, NTFS_IO_CONTEXT_PAGING_IO ) &&
                (IrpSp->FileObject != NULL)) {

                if (IrpSp->MajorFunction == IRP_MJ_READ) {

                    SetFlag( IrpSp->FileObject->Flags, FO_FILE_FAST_IO_READ );

                } else {

                    SetFlag( IrpSp->FileObject->Flags, FO_FILE_MODIFIED );
                }
            }
        }

    //
    //  If we had an error and will hot fix, we simply post the entire
    //  request.
    //

    } else if (!FlagOn( Context->Flags, NTFS_IO_CONTEXT_PAGING_IO )) {

        PIRP_CONTEXT IrpContext = NULL;

        //
        //  We need an IrpContext and then have to post the request.
        //  Use a try_except in case we fail the request for an IrpContext.
        //

        CompleteRequest = FALSE;
        try {

            //
            //  Hand the io context over to the new IrpContext before
            //  posting.
            //

            NtfsInitializeIrpContext( Irp, TRUE, &IrpContext );
            IrpContext->Union.NtfsIoContext = Context;
            SetFlag( IrpContext->State, IRP_CONTEXT_STATE_ALLOC_IO_CONTEXT );

            NtfsPostRequest( IrpContext, Irp );

        } except( EXCEPTION_EXECUTE_HANDLER ) {

            //
            //  Just give up.  The request is completed with the error
            //  already stored in the Irp.
            //

            CompleteRequest = TRUE;

            if (IrpContext) {

                //
                //  We cleanup the context below, so detach it from the
                //  IrpContext before cleaning that up.
                //

                IrpContext->Union.NtfsIoContext = NULL;
                NtfsCleanupIrpContext( IrpContext, TRUE );
            }
        }
    } else if ((Irp->IoStatus.Status == STATUS_INSUFFICIENT_RESOURCES) &&
               (IrpSp->MajorFunction == IRP_MJ_READ)) {

        //
        //  Track any lower drivers that fail a paging io read with
        //  insufficient resources.  We only get here for paging io since
        //  the non paging case was handled by the branch above.
        //

        NtfsFailedHandedOffPagingReads += 1;
    }

    //
    //  Now release the resource, using the values captured above.
    //

    if (Resource != NULL) {

        ExReleaseResourceForThreadLite( Resource,
                                    ResourceThreadId );
    }

    //
    //  NOTE(review): this trace always reports STATUS_SUCCESS although the
    //  routine returns STATUS_MORE_PROCESSING_REQUIRED when it posted the
    //  request.
    //

    DebugTrace( -1, Dbg, ("NtfsSingleAsyncCompletionRoutine -> STATUS_SUCCESS\n") );

    if (CompleteRequest) {

        //
        //  and finally, free the context record.  This is skipped when the
        //  request was posted since the context went along with it.
        //

        ExFreeToNPagedLookasideList( &NtfsIoContextLookasideList, Context );
        return STATUS_SUCCESS;

    } else {

        //
        //  The request was posted, so stop completion processing for the Irp.
        //

        return STATUS_MORE_PROCESSING_REQUIRED;
    }

}


//
//  Local support routine.
//

NTSTATUS
NtfsSingleSyncCompletionRoutine (
    IN PDEVICE_OBJECT DeviceObject,
    IN PIRP Irp,
    IN PVOID Contxt
    )

/*++

Routine Description:

    This is the completion routine for synchronous single-run reads and
    writes which pass their io context as the completion context (for
    example NtfsSingleNonAlignedSync).

    The completion routine has the following responsibilities:

        Count paging io reads which a lower driver failed with
        STATUS_INSUFFICIENT_RESOURCES.

        Set the SyncEvent in the Context parameter to signal the waiting
        thread that the request is done.

    The Irp itself is not modified or freed here.  The final status is
    left in Irp->IoStatus for the waiting thread to examine.

Arguments:

    DeviceObject - Pointer to the file system device object.  Not used.

    Irp - Pointer to the Irp for this request.

    Contxt - The PNTFS_IO_CONTEXT which was specified when the completion
             routine was set.

Return Value:

    Always STATUS_MORE_PROCESSING_REQUIRED, which stops completion
    processing of the Irp so that the thread which issued the request
    keeps control of it.

--*/

{
    PNTFS_IO_CONTEXT Context = Contxt;
    PIO_STACK_LOCATION IrpSp = IoGetCurrentIrpStackLocation( Irp );

    UNREFERENCED_PARAMETER( DeviceObject );

    //
    //  Pair the exit trace below with an entry trace, as the other
    //  completion routines in this module do.
    //

    DebugTrace( +1, Dbg, ("NtfsSingleSyncCompletionRoutine, Context = %08lx\n", Context) );

    //
    //  Track any lower drivers that fail a paging io read with
    //  insufficient resources.
    //

    if ((Irp->IoStatus.Status == STATUS_INSUFFICIENT_RESOURCES) &&
        FlagOn( Context->Flags, NTFS_IO_CONTEXT_PAGING_IO ) &&
        (IrpSp->MajorFunction == IRP_MJ_READ)) {

        NtfsFailedHandedOffPagingReads += 1;
    }

    //
    //  Wake up the waiter.  The Context is not referenced after this point.
    //

    KeSetEvent( &Context->Wait.SyncEvent, 0, FALSE );

    DebugTrace( -1, Dbg, ("NtfsSingleSyncCompletionRoutine -> STATUS_MORE_PROCESSING_REQUIRED\n") );

    return STATUS_MORE_PROCESSING_REQUIRED;
}


//
//  Local support routine.
//

NTSTATUS
NtfsPagingFileCompletionRoutine (
    IN PDEVICE_OBJECT DeviceObject,
    IN PIRP Irp,
    IN PVOID MasterIrp
    )

/*++

Routine Description:

    Completion routine for the reads and writes started via
    NtfsPagingFileIo.

    Nothing is done for a request which completed with real success.  For
    anything else (including the two FT recovery status codes) it does the
    following:

        Count requests failed with STATUS_INSUFFICIENT_RESOURCES.

        Unless the device failed totally, a verify is required, or the
        low order bit of MasterIrp is set, turn a read recovered from
        backup into plain success and post a hot fix for anything that is
        not a read.

        Copy the resulting IoStatus into the master Irp.

Arguments:

    DeviceObject - Pointer to the file system device object.  Not used.

    Irp - Pointer to the associated Irp which is being completed.  (This
          Irp will no longer be accessible after this routine returns.)

    MasterIrp - Pointer to the master Irp.  The low order bit in this value
        will be set if a higher level call is performing a hot-fix.

Return Value:

    Always returns STATUS_SUCCESS.

--*/

{
    PIO_STACK_LOCATION IrpSp = IoGetCurrentIrpStackLocation( Irp );

    UNREFERENCED_PARAMETER( DeviceObject );

    DebugTrace( +1, Dbg, ("NtfsPagingFileCompletionRoutine, MasterIrp = %08lx\n", MasterIrp) );

    //
    //  Insufficient resources is only expected for transfers larger than
    //  a page.
    //

    ASSERT( (Irp->IoStatus.Status != STATUS_INSUFFICIENT_RESOURCES) ||
            (IrpSp->Parameters.Read.Length > PAGE_SIZE) );

    if (!FT_SUCCESS( Irp->IoStatus.Status )) {

        //
        //  Keep a count of paging file operations that a lower driver
        //  failed for lack of resources.
        //

        if (Irp->IoStatus.Status == STATUS_INSUFFICIENT_RESOURCES) {

            NtfsFailedHandedOffPagingFileOps += 1;
        }

        //
        //  Only consider a hot fix if no higher level call is already
        //  doing one (low bit of MasterIrp), no verify is required and the
        //  device has not failed as a whole.
        //

        if (!FlagOn( (ULONG_PTR) MasterIrp, 0x1 ) &&
            (Irp->IoStatus.Status != STATUS_VERIFY_REQUIRED) &&
            !FsRtlIsTotalDeviceFailure( Irp->IoStatus.Status )) {

            //
            //  If the volume manager has actually completed the read
            //  from a backup, there's little point in telling MM about it.
            //

            if (Irp->IoStatus.Status == STATUS_FT_READ_RECOVERY_FROM_BACKUP) {

                Irp->IoStatus.Status = STATUS_SUCCESS;
            }

            //
            //  READ errors on the paging file are never hot fixed because
            //  of deadlock possibilities with MM; the error is simply
            //  returned for MM to deal with.  Write errors are fixed
            //  asynchronously and the error is still returned so that MM
            //  can retry elsewhere.
            //

            if (IrpSp->MajorFunction != IRP_MJ_READ) {

                VBO BadVbo = IrpSp->Parameters.Read.Key;

                NtfsPostHotFix( Irp,
                                &BadVbo,
                                IrpSp->Parameters.Read.ByteOffset.QuadPart,
                                IrpSp->Parameters.Read.Length,
                                FALSE );
            }
        }

        //
        //  Strip the hot fix marker bit to recover the real master Irp
        //  pointer, then remember the status in it.
        //

        MasterIrp = (PVOID) ((ULONG_PTR) MasterIrp & ~((ULONG_PTR) 0x1));
        ((PIRP) MasterIrp)->IoStatus = Irp->IoStatus;
    }

    DebugTrace( -1, Dbg, ("NtfsPagingFileCompletionRoutine => (STATUS_SUCCESS)\n") );

    return STATUS_SUCCESS;
}


//
//  Local support routine.
//

NTSTATUS
NtfsPagingFileNoAllocCompletionRoutine (
    IN PDEVICE_OBJECT DeviceObject,
    IN PIRP Irp,
    IN PVOID Context
    )

/*++

Routine Description:

    Completion routine for the reads and writes started via
    NtfsPagingFileIoNoAllocation.

    It counts requests which failed with STATUS_INSUFFICIENT_RESOURCES,
    signals the event passed as the context and stops completion
    processing of the Irp.

Arguments:

    DeviceObject - Pointer to the file system device object.  Not used.

    Irp - Pointer to the Irp which is being completed.

    Context - Actually the event (PKEVENT) to signal.

Return Value:

    Always returns STATUS_MORE_PROCESSING_REQUIRED.

--*/

{
    PIO_STACK_LOCATION IrpSp = IoGetCurrentIrpStackLocation( Irp );
    PKEVENT DoneEvent = (PKEVENT) Context;

    UNREFERENCED_PARAMETER( DeviceObject );

    //
    //  Insufficient resources is only expected for transfers larger than
    //  a page.
    //

    ASSERT( (Irp->IoStatus.Status != STATUS_INSUFFICIENT_RESOURCES) ||
            (IrpSp->Parameters.Read.Length > PAGE_SIZE) );

    //
    //  Keep a count of paging file operations that a lower driver failed
    //  for lack of resources.
    //

    if (Irp->IoStatus.Status == STATUS_INSUFFICIENT_RESOURCES) {

        NtfsFailedHandedOffPagingFileOps += 1;
    }

    //
    //  Signal back to the main routine, which keeps control of the Irp.
    //

    KeSetEvent( DoneEvent, IO_NO_INCREMENT, FALSE );

    return STATUS_MORE_PROCESSING_REQUIRED;
}


//
//  Local support routine
//

VOID
NtfsSingleNonAlignedSync (
    IN PIRP_CONTEXT IrpContext,
    IN PVCB Vcb,
    IN PSCB Scb,
    IN PUCHAR Buffer,
    IN VBO Vbo,
    IN LBO Lbo,
    IN ULONG ByteCount,
    IN PIRP Irp
    )

/*++

Routine Description:

    This routine reads or writes one or more contiguous sectors from a device
    Synchronously, and does so to a buffer that must come from non paged
    pool.  It saves a pointer to the Irp's original Mdl, and creates a new
    one describing the given buffer.  It implements the read by simply filling
    in the next stack frame in the Irp, and passing it on.  The transfer
    occurs to the buffer supplied by the caller, not to the buffer described
    by the Irp's original Mdl.

    The routine waits for the request to complete.  If the request did not
    succeed (FT_SUCCESS) it calls NtfsFixDataError for the run.  The
    original Mdl is put back into the Irp before the routine returns or
    raises.

    Currently, only reads are supported.

Arguments:

    IrpContext - Supplies the context of the request.  Its MajorFunction
        (IRP_MJ_READ or IRP_MJ_WRITE) is stored in the next stack location
        and its Union.NtfsIoContext supplies the SyncEvent and is the
        context passed to the completion routine.

    Vcb - Supplies the device to read

    Scb - Supplies the Scb to read.  It is only passed on to
        NtfsFixDataError.

    Buffer - Supplies a buffer from non-paged pool.

    Vbo - Supplies the starting Virtual Block Offset to begin reading from.
        It is only used to describe the run to NtfsFixDataError.

    Lbo - Supplies the starting Logical Block Offset to begin reading from

    ByteCount - Supplies the number of bytes to read from the device

    Irp - Supplies the Irp which is sent to the device.  Its MdlAddress
          is temporarily replaced.

Return Value:

    None.  STATUS_INSUFFICIENT_RESOURCES is raised if the Mdl cannot be
    allocated.

--*/

{
    PIO_STACK_LOCATION IrpSp;

    PMDL Mdl;
    PMDL SavedMdl;

    PAGED_CODE();

    DebugTrace( +1, Dbg, ("NtfsSingleNonAlignedSync\n") );
    DebugTrace( 0, Dbg, ("MajorFunction = %08lx\n", IrpContext->MajorFunction) );
    DebugTrace( 0, Dbg, ("Vcb           = %08lx\n", Vcb) );
    DebugTrace( 0, Dbg, ("Buffer        = %08lx\n", Buffer) );
    DebugTrace( 0, Dbg, ("Lbo           = %016I64x\n", Lbo) );
    DebugTrace( 0, Dbg, ("ByteCount     = %08lx\n", ByteCount) );
    DebugTrace( 0, Dbg, ("Irp           = %08lx\n", Irp) );

    //
    //  Create a new Mdl describing the buffer, saving the current one in the
    //  Irp.  The Irp's Mdl pointer is cleared first and the Irp is passed
    //  to IoAllocateMdl, which is expected to attach the new Mdl to it.
    //

    SavedMdl = Irp->MdlAddress;

    Irp->MdlAddress = 0;

    Mdl = IoAllocateMdl( Buffer,
                         ByteCount,
                         FALSE,
                         FALSE,
                         Irp );

    if (Mdl == NULL) {

        //
        //  Put the original Mdl back before raising.
        //

        Irp->MdlAddress = SavedMdl;

        NtfsRaiseStatus( IrpContext, STATUS_INSUFFICIENT_RESOURCES, NULL, NULL );
    }

    //
    //  Lock the new Mdl in memory.  If the probe raises, free the new Mdl
    //  and restore the original one while the exception unwinds.
    //

    try {

        MmProbeAndLockPages( Mdl, KernelMode, IoWriteAccess );

    } finally {

        if (AbnormalTermination()) {

            IoFreeMdl( Mdl );
            Irp->MdlAddress = SavedMdl;
        }
    }

    //
    // Set up the completion routine address in our stack frame.  It is
    // invoked for success, error and cancel, and signals the SyncEvent in
    // the io context.
    //

    IoSetCompletionRoutine( Irp,
                            &NtfsSingleSyncCompletionRoutine,
                            IrpContext->Union.NtfsIoContext,
                            TRUE,
                            TRUE,
                            TRUE );

    //
    //  Setup the next IRP stack location in the associated Irp for the disk
    //  driver beneath us.
    //

    IrpSp = IoGetNextIrpStackLocation( Irp );

    //
    //  Setup the Stack location to do a read from the disk driver.
    //

    IrpSp->MajorFunction = IrpContext->MajorFunction;
    IrpSp->Parameters.Read.Length = ByteCount;
    IrpSp->Parameters.Read.ByteOffset.QuadPart = Lbo;

    //
    // Initialize the Kernel Event in the context structure so that the
    // caller can wait on it.  This must happen before the Irp is sent down
    // since the completion routine sets this event.
    //

    KeInitializeEvent( &IrpContext->Union.NtfsIoContext->Wait.SyncEvent,
                       NotificationEvent,
                       FALSE );

    //
    //  Issue the read request
    //
    //  If IoCallDriver returns an error, it has completed the Irp
    //  and the error will be caught by our completion routines
    //  and dealt with as a normal IO error.
    //

    try {

        (VOID)IoCallDriver( Vcb->TargetDeviceObject, Irp );

        NtfsWaitSync( IrpContext );

        //
        //  See if we need to do a hot fix.  The completion routine left the
        //  final status in the Irp.
        //

        if (!FT_SUCCESS(Irp->IoStatus.Status)) {

            IO_RUN IoRun;

            //
            //  Describe the whole transfer as a single run.
            //

            IoRun.StartingVbo = Vbo;
            IoRun.StartingLbo = Lbo;
            IoRun.BufferOffset = 0;
            IoRun.ByteCount = ByteCount;
            IoRun.SavedIrp = NULL;

            //
            //  Try to fix the problem
            //

            NtfsFixDataError( IrpContext,
                              Scb,
                              Vcb->TargetDeviceObject,
                              Irp,
                              1,
                              &IoRun,
                              0 );
        }

    } finally {

        //
        //  Whether we finish normally or something raised, undo the Mdl
        //  setup and give the Irp its original Mdl back.
        //

        MmUnlockPages( Mdl );

        IoFreeMdl( Mdl );

        Irp->MdlAddress = SavedMdl;
    }

    //
    //  And return to our caller
    //

    DebugTrace( -1, Dbg, ("NtfsSingleNonAlignedSync -> VOID\n") );

    return;
}


//
//  Local support routine
//

NTSTATUS
NtfsEncryptBuffers (
    IN PIRP_CONTEXT IrpContext,
    IN PIRP Irp,
    IN PSCB Scb,
    IN VBO StartingVbo,
    IN ULONG NumberRuns,
    IN PCOMPRESSION_CONTEXT CompressionContext
    )

/*++

Routine Description:

    This routine is called by NtfsPrepareBuffers during a write
    operation on an encrypted file.  It makes sure the compression
    buffer in the CompressionContext is allocated and large enough,
    then hands each run described by the CompressionContext to the
    encryption driver's BeforeWriteProcess callout so the data can be
    encrypted on its way to the disk.

    The encrypted output always goes to the compression buffer.  The
    input is that same buffer if the data was already transformed into
    it, otherwise it is the mapped user buffer at SystemBufferOffset.

Arguments:

    IrpContext - Supplies the context of the current request.  The request
        is asserted to be a write.

    Irp - Supplies the requesting Irp.

    Scb - Supplies the stream file to act on.  Its EncryptionContext is
        passed to the encryption callout.

    StartingVbo - The starting point for the operation.

    NumberRuns - The number of entries in the IoRuns array of the
        compression context to process.  Asserted to be nonzero.

    CompressionContext - Supplies the CompressionContext for this stream.

Return Value:

    NTSTATUS - STATUS_SUCCESS if the callout succeeded for every run,
        otherwise the first failure status returned by the callout.

--*/

{
    PIO_RUN LastRun;
    PIO_RUN CurrentRun;

    PUCHAR InputBase;
    PUCHAR OutputBase;

    LARGE_INTEGER FileOffset;
    ULONG RequiredSize;
    ULONG RunsLeft;

    NTSTATUS Status;

    ASSERT( NumberRuns > 0 );
    ASSERT( IrpContext->MajorFunction == IRP_MJ_WRITE );

    //
    //  Debug-only trace when the transfer does not start at the beginning
    //  of the system buffer.
    //

#ifdef EFSDBG
    if (CompressionContext->SystemBufferOffset != 0) {

        DebugTrace( 0, Dbg, ("\nEncryptBuffers: SystemBufferOffset = %x", CompressionContext->SystemBufferOffset) );
    }
#endif

    //
    //  Map the user buffer now if nobody has done it yet.
    //

    if (CompressionContext->SystemBuffer == NULL) {

        CompressionContext->SystemBuffer = NtfsMapUserBuffer( Irp, NormalPagePriority );
    }

    //
    //  Work out how big the buffer has to be.  It must span from the
    //  StartingVbo of this transfer through the end of the last io run,
    //  but we never ask for more than LARGE_BUFFER_SIZE.
    //

    LastRun = &CompressionContext->IoRuns[NumberRuns - 1];

    RequiredSize = (ULONG) ((LastRun->StartingVbo + LastRun->ByteCount) - StartingVbo);

    if (RequiredSize > LARGE_BUFFER_SIZE) {

        RequiredSize = LARGE_BUFFER_SIZE;
    }

    //
    //  Data that has already been transformed implies the buffer that
    //  holds it is still around.
    //

    ASSERT( (!CompressionContext->DataTransformed) ||
            (CompressionContext->CompressionBuffer != NULL) );

    //
    //  This only allocates/reallocates when there is no buffer yet or the
    //  existing one is too small.
    //

    NtfsAllocateCompressionBuffer( IrpContext,
                                   Scb,
                                   Irp,
                                   CompressionContext,
                                   &RequiredSize );

    //
    //  The output always goes to the compression buffer.  The input comes
    //  from the same place if the data has already been transformed (for a
    //  compressed or sparse file, for instance); otherwise it is taken
    //  straight from the system buffer.  The compression buffer is sampled
    //  only after the allocation call above.
    //

    OutputBase = CompressionContext->CompressionBuffer;
    InputBase = OutputBase;

    if (!CompressionContext->DataTransformed) {

        InputBase = Add2Ptr( CompressionContext->SystemBuffer, CompressionContext->SystemBufferOffset );
    }

    //
    //  Walk the runs of real data heading to the disk and let the
    //  encryption driver encrypt each one.  Stop at the first failure.
    //

    CurrentRun = &CompressionContext->IoRuns[0];

    for (RunsLeft = NumberRuns; RunsLeft != 0; RunsLeft -= 1, CurrentRun += 1) {

        FileOffset.QuadPart = CurrentRun->StartingVbo;

        Status = NtfsData.EncryptionCallBackTable.BeforeWriteProcess( Add2Ptr(InputBase, CurrentRun->BufferOffset),
                                                                      Add2Ptr(OutputBase, CurrentRun->BufferOffset),
                                                                      &FileOffset,
                                                                      CurrentRun->ByteCount,
                                                                      Scb->EncryptionContext);
        if (!NT_SUCCESS( Status )) {

            return Status;
        }
    }

    return STATUS_SUCCESS;
}


VOID
NtfsFixDataError (
    IN PIRP_CONTEXT IrpContext,
    IN PSCB Scb,
    IN PDEVICE_OBJECT DeviceObject,
    IN PIRP MasterIrp,
    IN ULONG MultipleIrpCount,
    IN PIO_RUN IoRuns,
    IN UCHAR IrpSpFlags
    )

/*

Routine Description:

    This routine is called when a read error, write error, or Usa error
    is received when doing noncached I/O on a stream.  It attempts to
    recover from Usa errors if FT is present.  For bad clusters it attempts
    to isolate the error to one or more bad clusters, for which hot fix
    requests are posted.

Arguments:

    Scb - Supplies the Scb for the stream which got the error

    DeviceObject - Supplies the Device Object for the stream

    MasterIrp - Supplies the original master Irp for the failing read or write

    MultipleIrpCount - Supplies the number of runs into which the current
                       transfer was broken at the time the error occurred.

    IoRuns - Supplies an array describing the runs being accessed at the
             time of the error

    IrpSpFlags - flags to set in irp stack location for the i/o like write_through

Return Value:

    None

-*/

{
    ULONG RunNumber, FtCase;
    ULONG ByteOffset = MAXULONG;
    ULONG ClusterMask;
    ULONG ClustersToRecover;
    ULONG UsaBlockSize;
    PIO_STACK_LOCATION IrpSp;
    PVCB Vcb = Scb->Vcb;
    ULONG BytesPerCluster = Vcb->BytesPerCluster;
    NTSTATUS FinalStatus = STATUS_SUCCESS;
    ULONG AlignedRunNumber = 0;
    ULONG AlignedByteOffset = 0;
    NTSTATUS IrpStatus = MasterIrp->IoStatus.Status;
    PTOP_LEVEL_CONTEXT TopLevelContext;

    PNTFS_IO_CONTEXT Context = IrpContext->Union.NtfsIoContext;
    PMULTI_SECTOR_HEADER MultiSectorHeader;

    UCHAR Buffer[sizeof( MDL ) + sizeof( PFN_NUMBER ) * 2];
    PMDL PartialMdl = (PMDL) Buffer;

    LONGLONG LlTemp1;
    LONGLONG LlTemp2;
    ULONG Priority = NormalPagePriority;

    BOOLEAN SecondaryAvailable;
    BOOLEAN FixingUsaError;
    BOOLEAN FinalPass;
    BOOLEAN ReservedMapping = FALSE;

    PAGED_CODE();

    //
    //  First, if the error we got indicates a total device failure, then we
    //  just report it rather than trying to hot fix every sector on the volume!
    //  Also, do not do hot fix for the read ahead thread, because that is a
    //  good way to conceal errors from the App.
    //

    if (FsRtlIsTotalDeviceFailure( MasterIrp->IoStatus.Status ) ||
        (Scb->CompressionUnit != 0)) {

        return;
    }

    //
    //  Get out if we got an error and the current thread is doing read ahead.
    //

    if (!NT_SUCCESS( MasterIrp->IoStatus.Status ) && NtfsIsReadAheadThread()) {

        return;
    }

    //
    //  Also get out if the top level request came from the fast io path.
    //

    TopLevelContext = NtfsGetTopLevelContext();

    if (TopLevelContext->SavedTopLevelIrp == (PIRP) FSRTL_FAST_IO_TOP_LEVEL_IRP) {

        return;
    }

    //
    //  We can't hot fix the mft mirror or the boot file.  If we're in here
    //  for one of those files, we have to get out now.  We'll make sure we
    //  aren't trying to hot fix the beginning of the mft itself just before
    //  we call NtfsPostHotFix down below.
    //

    ASSERT (Scb != NULL);

    if ((Scb == Vcb->Mft2Scb) ||
        (NtfsEqualMftRef( &Scb->Fcb->FileReference, &BootFileReference ) &&
         (Scb->AttributeTypeCode == $DATA))) {

        return;
    }

    //
    //  Determine whether a secondary device is available
    //

    SecondaryAvailable = (BOOLEAN)!FlagOn( Vcb->VcbState, VCB_STATE_NO_SECONDARY_AVAILABLE );

    //
    //  Assume that we are recovering from a Usa error, if the MasterIrp has
    //  the success status.
    //

    FixingUsaError = FT_SUCCESS( MasterIrp->IoStatus.Status );

    //
    //  We cannot fix any Usa errors if there is no secondary.  Even if there is
    //  a secondary, Usa errors should only occur during restart.  If it is not
    //  restart we are probably looking at uninitialized data, so don't try to
    //  "fix" it.
    //

    if (FixingUsaError &&
        (!SecondaryAvailable || !FlagOn( Vcb->VcbState, VCB_STATE_RESTART_IN_PROGRESS ))) {
        return;
    }

    //
    //  If there is no secondary available and this is a user non-cached read then simply
    //  return the error.  Give this user a chance to re-write the sector himself using
    //  non-cached io.
    //

    if (!SecondaryAvailable &&
        (IrpContext->MajorFunction == IRP_MJ_READ) &&
        (FlagOn( MasterIrp->Flags, IRP_PAGING_IO | IRP_NOCACHE ) == IRP_NOCACHE)) {

        return;
    }

    //
    //  No hot fixing at all if the volume is read only.
    //

    if (NtfsIsVolumeReadOnly( Vcb )) {

        return;
    }

    //
    //  Initialize Context, for use in Read/Write Multiple Asynch.
    //

    ASSERT( Context != NULL );

    Context->MasterIrp = MasterIrp;
    KeInitializeEvent( &Context->Wait.SyncEvent, NotificationEvent, FALSE );

    HotFixTrace(("NtfsFixDataError, MasterIrp: %08lx, MultipleIrpCount: %08lx\n", MasterIrp, MultipleIrpCount));
    HotFixTrace(("                  IoRuns: %08lx, UsaError: %02lx\n", IoRuns, FixingUsaError));
    HotFixTrace(("                  Thread: %08lx\n", PsGetCurrentThread()));
    HotFixTrace(("                  Scb:    %08lx   BadClusterScb:  %08lx\n", Scb, Vcb->BadClusterFileScb));

    //
    //  If this is a Usa-protected structure, get the block size now.
    //

    if (FlagOn( Scb->ScbState, SCB_STATE_USA_PRESENT )) {

        //
        //  Get the number of blocks, based on what type of stream it is.
        //  First check for Mft or Log file.
        //

        if (Scb->Header.NodeTypeCode == NTFS_NTC_SCB_MFT) {

            ASSERT( (Scb == Vcb->MftScb) || (Scb == Vcb->Mft2Scb) );

            UsaBlockSize = Vcb->BytesPerFileRecordSegment;

        } else if (Scb->Header.NodeTypeCode == NTFS_NTC_SCB_DATA) {

            //
            //  For the log file, we will just go a page at a time, which
            //  is generally what the log file does.  Any USA errors would
            //  tend to be only at the logical end of the log file anyway.
            //

            ASSERT( Scb == Vcb->LogFileScb );

            //
            //  We need to peek at the page so map it in
            //

            MultiSectorHeader = (PMULTI_SECTOR_HEADER) NtfsMapUserBufferNoRaise( MasterIrp, HighPagePriority );

            //
            //  We can't map the user buffer due to low resources - so switch to using the reserved
            //  mapping instead
            //

            if (MultiSectorHeader == NULL) {

                ExAcquireFastMutexUnsafe( &Vcb->ReservedMappingMutex );
                ReservedMapping = TRUE;

                MmInitializeMdl( PartialMdl, NULL, 2 * PAGE_SIZE );
                IoBuildPartialMdl( MasterIrp->MdlAddress, PartialMdl, Add2Ptr( MmGetMdlBaseVa( MasterIrp->MdlAddress ), MmGetMdlByteOffset( MasterIrp->MdlAddress )), Vcb->BytesPerSector );
                MultiSectorHeader = (PMULTI_SECTOR_HEADER) MmMapLockedPagesWithReservedMapping( IrpContext->Vcb->ReservedMapping,
                                                                                                RESERVE_POOL_TAG,
                                                                                                PartialMdl,
                                                                                                MmCached );
                ASSERT( MultiSectorHeader != NULL );
            }

            //
            //  For the log file, assume it is right in the record, use that
            //  if we get a plausible number, else use page size.
            //

            RunNumber = MultiSectorHeader->UpdateSequenceArraySize - 1;
            UsaBlockSize = RunNumber * SEQUENCE_NUMBER_STRIDE;

            if ((UsaBlockSize != 0x1000) && (UsaBlockSize != 0x2000) && (UsaBlockSize != PAGE_SIZE)) {

                UsaBlockSize = PAGE_SIZE;
            }

            //
            //  Drop the reserved mapping - since we're done with the multi-sector header
            //

            if (ReservedMapping) {
                MmUnmapReservedMapping( Vcb->ReservedMapping, RESERVE_POOL_TAG, PartialMdl );
                MmPrepareMdlForReuse( PartialMdl );
                ExReleaseFastMutexUnsafe( &Vcb->ReservedMappingMutex );
                ReservedMapping = FALSE;
                MultiSectorHeader = NULL;
            }

        //
        //  Otherwise it is an index, so we can get the count out of the Scb.
        //

        } else {

            UsaBlockSize = Scb->ScbType.Index.BytesPerIndexBuffer;
        }

        //
        //  Verify the maximum of UsaBlockSize and cluster size.
        //

        if (BytesPerCluster > UsaBlockSize) {

            //
            //  Determine which is smaller the cluster size or the
            //  size of the buffer being read.
            //

            IrpSp = IoGetCurrentIrpStackLocation( MasterIrp );

            UsaBlockSize = IrpSp->Parameters.Read.Length;
            if (UsaBlockSize > BytesPerCluster) {

                UsaBlockSize = BytesPerCluster;
            }
        }
    }

    //
    //  We know we got a failure in the given transfer, which could be any size.
    //  We first want to localize the error to the failing cluster(s).
    //
    //  We do this in the following nested loops:
    //
    //      do (for the entire transfer, 32 clusters at a time)
    //
    //          for (primary, secondary if available, primary again if necessary)
    //
    //              for (each run)
    //
    //                  for (each cluster)
    //
    //  The inner-most two loops above have the ability to restart on successive
    //  32-cluster boundaries, relative to the first cluster in the transfer.
    //  For the Ft case, where there is a secondary device available, clusters
    //  are blocked out of a mask as errors are found and corrected, so they
    //  do not have to be read in successive passes; Usa errors are blocked out
    //  of the mask immediately, while for I/O errors we force ourselves to read
    //  both copies to locate the error, only reading the primary again if the
    //  secondary contained the error.
    //

    //
    //  Loop through the entire transfer, 32 clusters at a time.  The innermost
    //  loops will terminate on 32 cluster boundaries, so the outermost loop
    //  will simply keep looping until we exhaust the IoRuns array.
    //

    do {

        //
        //  Initialize the clusters to recover to "all".
        //

        ClustersToRecover = MAXULONG;
        FinalPass = FALSE;

        //
        //  For these 32 clusters, loop through primary, secondary (if available),
        //  and primary again (only reading when necessary).
        //

        for (FtCase = 0; !FinalPass; FtCase++) {

            //
            //  Calculate whether this is the final pass or not.
            //

            FinalPass = !SecondaryAvailable ||
                        (FtCase == 2) ||
                        (IrpContext->MajorFunction == IRP_MJ_WRITE);

            //
            //  Initialize the current cluster mask for cluster 0
            //

            ClusterMask = 1;

            //
            //  Loop through all of the runs in the IoRuns array, or until the
            //  ClusterMask indicates that we hit a 32 cluster boundary.
            //

            for (RunNumber = AlignedRunNumber;
                 (RunNumber < MultipleIrpCount) && (ClusterMask != 0);
                 (ClusterMask != 0) ? RunNumber++ : 0) {

                //
                //  Loop through all of the clusters within this run, or until
                //  the ClusterMask indicates that we hit a 32 cluster boundary.
                //

                for (ByteOffset = (RunNumber == AlignedRunNumber) ? AlignedByteOffset : 0;
                     (ByteOffset < IoRuns[RunNumber].ByteCount) && (ClusterMask != 0);
                     ByteOffset += BytesPerCluster, ClusterMask <<= 1) {

                    LONGLONG StartingVbo, StartingLbo;
                    PIRP Irp;
                    PMDL Mdl;
                    BOOLEAN LowFileRecord;
                    FT_SPECIAL_READ SpecialRead;
                    ULONG Length;

                    HotFixTrace(("Doing ByteOffset: %08lx for FtCase: %02lx\n",
                                (((ULONG)IoRuns[RunNumber].StartingVbo) + ByteOffset),
                                FtCase));

                    //
                    //  If this cluster no longer needs to be recovered, we can
                    //  skip it.
                    //

                    if ((ClustersToRecover & ClusterMask) == 0) {
                        continue;
                    }

                    //
                    //  Temporarily get the 64-bit byte offset into StartingVbo, then
                    //  calculate the actual StartingLbo and StartingVbo.
                    //

                    StartingVbo = ByteOffset;

                    StartingLbo = IoRuns[RunNumber].StartingLbo + StartingVbo;
                    StartingVbo = IoRuns[RunNumber].StartingVbo + StartingVbo;

                    //
                    //  If the file is compressed, then NtfsPrepareBuffers builds
                    //  an IoRuns array where it compresses contiguous Lcns, and
                    //  the Vcns do not always line up correctly.  But we know there
                    //  must be a corresponding Vcn for every Lcn in the stream,
                    //  and that that Vcn can only be >= to the Vcn we have just
                    //  calculated from the IoRuns array.  Therefore, since performance
                    //  of hotfix is not the issue here, we use the following simple
                    //  loop to sequentially scan the Mcb for a matching Vcn for
                    //  the current Lcn.
                    //

                    if (Scb->CompressionUnit != 0) {

                        VCN TempVcn;
                        LCN TempLcn, LcnOut;

                        TempLcn = LlClustersFromBytes( Vcb, StartingLbo );
                        TempVcn = LlClustersFromBytes( Vcb, StartingVbo );

                        //
                        //  Scan to the end of the Mcb (we assert below this
                        //  did not happen) or until we find a Vcn with the
                        //  Lcn we currently want to read.
                        //

                        while (NtfsLookupNtfsMcbEntry( &Scb->Mcb,
                                                       TempVcn,
                                                       &LcnOut,
                                                       NULL,
                                                       NULL,
                                                       NULL,
                                                       NULL,
                                                       NULL )

                                 &&

                               (LcnOut != TempLcn)) {

                            TempVcn = TempVcn + 1;
                        }

                        ASSERT(LcnOut == TempLcn);

                        StartingVbo = LlBytesFromClusters( Vcb, TempVcn );
                    }

                    LowFileRecord = (Scb == Vcb->MftScb) && (((PLARGE_INTEGER)&StartingVbo)->HighPart == 0);

                    //
                    //  Calculate the amount to actually read.
                    //


                    Length = IoRuns[RunNumber].ByteCount - ByteOffset;

                    if (Length > BytesPerCluster) {

                        Length = BytesPerCluster;
                    }

                    //
                    //  Loop while verify required, or we find we really
                    //  do not have an FT device.
                    //

                    while (TRUE) {

                        //
                        //  Create an associated IRP, making sure there is one stack entry for
                        //  us, as well.
                        //

                        Irp = IoMakeAssociatedIrp( MasterIrp, (CCHAR)(DeviceObject->StackSize + 1) );

                        if (Irp == NULL) {

                            //
                            //  We return the error status in the Master irp when
                            //  we were called.
                            //

                            MasterIrp->IoStatus.Status = IrpStatus;
                            return;
                        }

                        //
                        // Allocate and build a partial Mdl for the request.
                        //

                        Mdl = IoAllocateMdl( (PCHAR)MasterIrp->UserBuffer + IoRuns[RunNumber].BufferOffset + ByteOffset,
                                             Length,
                                             FALSE,
                                             FALSE,
                                             Irp );

                        if (Mdl == NULL) {

                            IoFreeIrp(Irp);

                            //
                            //  We return the error status in the Master irp when
                            //  we were called.
                            //

                            MasterIrp->IoStatus.Status = IrpStatus;
                            return;
                        }

                        //
                        //  Sanity Check
                        //

                        ASSERT( Mdl == Irp->MdlAddress );

                        IoBuildPartialMdl( MasterIrp->MdlAddress,
                                           Mdl,
                                           (PCHAR)MasterIrp->UserBuffer +
                                             IoRuns[RunNumber].BufferOffset + ByteOffset,
                                           Length );

                        //
                        //  Get the first IRP stack location in the associated Irp
                        //

                        IoSetNextIrpStackLocation( Irp );
                        IrpSp = IoGetCurrentIrpStackLocation( Irp );

                        //
                        //  Setup the Stack location to describe our read.
                        //

                        IrpSp->MajorFunction = IrpContext->MajorFunction;
                        IrpSp->Parameters.Read.Length = Length;
                        IrpSp->Parameters.Read.ByteOffset.QuadPart = StartingVbo;

                        //
                        // Set up the completion routine address in our stack frame.
                        //

                        IoSetCompletionRoutine( Irp,
                                                &NtfsMultiSyncCompletionRoutine,
                                                Context,
                                                TRUE,
                                                TRUE,
                                                TRUE );

                        //
                        //  Setup the next IRP stack location in the associated Irp for the disk
                        //  driver beneath us.
                        //

                        IrpSp = IoGetNextIrpStackLocation( Irp );

                        //
                        //  Setup the Stack location to do a normal read or write.
                        //

                        if ((IrpContext->MajorFunction == IRP_MJ_WRITE) || !SecondaryAvailable) {

                            IrpSp->MajorFunction = IrpContext->MajorFunction;
                            IrpSp->Flags = IrpSpFlags;
                            IrpSp->Parameters.Read.ByteOffset.QuadPart = StartingLbo;
                            IrpSp->Parameters.Read.Length = Length;

                        //
                        //  Otherwise we are supposed to read from the primary or secondary
                        //  on an FT drive.
                        //

                        } else {

                            IrpSp->MajorFunction = IRP_MJ_DEVICE_CONTROL;

                            if (FtCase != 1) {
                                IrpSp->Parameters.DeviceIoControl.IoControlCode = FT_PRIMARY_READ;
                            } else {
                                IrpSp->Parameters.DeviceIoControl.IoControlCode = FT_SECONDARY_READ;
                            }

                            Irp->AssociatedIrp.SystemBuffer = &SpecialRead;
                            SpecialRead.ByteOffset.QuadPart = StartingLbo;
                            SpecialRead.Length = Length;
                        }

                        //
                        //  We only need to set the associated IRP count in the master irp to
                        //  make it a master IRP.  But we set the count to one more than our
                        //  caller requested, because we do not want the I/O system to complete
                        //  the I/O.  We also set our own count.
                        //

                        Context->IrpCount = 1;
                        MasterIrp->AssociatedIrp.IrpCount = 2;

                        //
                        //  NtfsMultiSyncCompletionRoutine only modifies the status on errors,
                        //  so we have to reset to success before each call.
                        //

                        MasterIrp->IoStatus.Status = STATUS_SUCCESS;

                        //
                        //  If IoCallDriver returns an error, it has completed the Irp
                        //  and the error will be caught by our completion routines
                        //  and dealt with as a normal IO error.
                        //

                        HotFixTrace(("Calling driver with Irp: %08lx\n", Irp));
                        KeClearEvent( &Context->Wait.SyncEvent );

                        (VOID)IoCallDriver( DeviceObject, Irp );

                        //
                        //  Now wait for it.
                        //

                        NtfsWaitSync( IrpContext );

                        HotFixTrace(("Request completion status: %08lx\n", MasterIrp->IoStatus.Status));

                        //
                        //  If we were so lucky to get a verify required, then
                        //  spin our wheels here a while.
                        //

                        if (MasterIrp->IoStatus.Status == STATUS_VERIFY_REQUIRED) {

                            //
                            //  Otherwise we need to verify the volume, and if it doesn't
                            //  verify correctly then we dismount the volume and report
                            //  our error.
                            //

                            if (!NtfsPerformVerifyOperation( IrpContext, Vcb )) {

                                //**** NtfsPerformDismountOnVcb( IrpContext, Vcb, TRUE, NULL );
                                ClearFlag( Vcb->VcbState, VCB_STATE_VOLUME_MOUNTED );

                                MasterIrp->IoStatus.Status = STATUS_FILE_INVALID;
                                return;
                            }

                            //
                            //  The volume verified correctly so now clear the verify bit
                            //  and try and I/O again
                            //

                            ClearFlag( Vcb->Vpb->RealDevice->Flags, DO_VERIFY_VOLUME );

                        //
                        //  We may have assumed that there was a secondary available
                        //  and there is not.  We can only tell from getting this code.
                        //  Indicate there is no secondary and that we will be only
                        //  making one pass.
                        //

                        } else if (MasterIrp->IoStatus.Status == STATUS_INVALID_DEVICE_REQUEST) {

                            ASSERT((IrpContext->MajorFunction != IRP_MJ_WRITE) && SecondaryAvailable);

                            SetFlag(Vcb->VcbState, VCB_STATE_NO_SECONDARY_AVAILABLE);
                            SecondaryAvailable = FALSE;
                            FinalPass = TRUE;

                        //
                        //  If the secondary is offline then there is nothing to recover.
                        //

                       } else if (MasterIrp->IoStatus.Status == STATUS_FT_MISSING_MEMBER) {

                           //
                           //  FTDISK will return this error if they are in initialization.
                           //  Then we don't want to set VCB_STATE_NO_SECONDARY_AVAILABLE but
                           //  will need to check whether we really want to hotfix.
                           //

                           SecondaryAvailable = FALSE;
                           FinalPass = TRUE;

                        //
                        //  Otherwise we got success or another error and we should proceed.
                        //

                        } else {
                            break;
                        }
                    }

                    //
                    //  Check again if we really want to perform the hot-fix in the event the status
                    //  of the secondary has changed.
                    //

                    if (!SecondaryAvailable &&
                        (IrpContext->MajorFunction == IRP_MJ_READ) &&
                        (FlagOn( MasterIrp->Flags, IRP_PAGING_IO | IRP_NOCACHE ) == IRP_NOCACHE)) {

                        MasterIrp->IoStatus.Status = IrpStatus;
                        return;
                    }

                    if (!FT_SUCCESS(MasterIrp->IoStatus.Status)) {

                        BOOLEAN IsHotFixPage;

                        //
                        //  Calculate whether or not this is the hot fix thread itself
                        //  (i.e., executing NtfsPerformHotFix).
                        //

                        IsHotFixPage = NtfsIsTopLevelHotFixScb( Scb );

                        LlTemp1 = StartingVbo >> PAGE_SHIFT;                  //**** crock for x86 compiler bug
                        LlTemp2 = NtfsGetTopLevelHotFixVcn() >> PAGE_SHIFT;   //**** crock for x86 compiler bug

                        if (!IsHotFixPage ||
                            LlTemp1 != LlTemp2) {


                            IsHotFixPage = FALSE;
                        }

                        //
                        //  If the entire device manages to fail in the middle of this,
                        //  get out.
                        //

                        if (FsRtlIsTotalDeviceFailure(MasterIrp->IoStatus.Status)) {

                            MasterIrp->IoStatus.Status = IrpStatus;
                            return;
                        }

                        //
                        //  If this is not a write, fill the cluster with -1 for the
                        //  event that we ultimately never find good data.  This is
                        //  for security reasons (cannot show anyone the data that
                        //  happens to be in the buffer now), signature reasons (let
                        //  -1 designate read errors, as opposed to 0's which occur
                        //  on ValidDataLength cases), and finally if we fail to read
                        //  a bitmap, we must consider all clusters allocated if we
                        //  wish to continue to use the volume before chkdsk sees it.
                        //

                        if (IrpContext->MajorFunction == IRP_MJ_READ) {

                            NtfsFillIrpBuffer( IrpContext, MasterIrp, Length, IoRuns[RunNumber].BufferOffset + ByteOffset, 0xFF );

                            //
                            //  If this is file system metadata, then we better mark the
                            //  volume corrupt.
                            //

                            if (FinalPass &&
                                FlagOn(Scb->ScbState, SCB_STATE_MODIFIED_NO_WRITE) &&
                                (!LowFileRecord || (((ULONG)StartingVbo >= PAGE_SIZE) &&
                                                    ((ULONG)StartingVbo >= (ULONG)((VOLUME_DASD_NUMBER + 1) << Vcb->MftShift))))) {

                                NtfsPostVcbIsCorrupt( IrpContext, 0, NULL, NULL );
                            }

                            //
                            //  If this is a Usa-protected file, or the bitmap,
                            //  then we will try to procede with our 0xFF pattern
                            //  above rather than returning an error to our caller.
                            //  The Usa guy will get a Usa error, and the bitmap
                            //  will safely say that everything is allocated until
                            //  chkdsk can fix it up.
                            //

                            if (FlagOn(Scb->ScbState, SCB_STATE_USA_PRESENT) ||
                                (Scb == Vcb->BitmapScb)) {

                                MasterIrp->IoStatus.Status = STATUS_SUCCESS;
                            }
                        }

                        //
                        //  If we are not the page being hot fixed, we want to post the
                        //  hot fix and possibly remember the final status.
                        //

                        if (!IsHotFixPage) {

                            //
                            //  If we got a media error, post the hot fix now.  We expect
                            //  to post at most one hot fix in this routine.  When we post
                            //  it it will serialize on the current stream.  Do not attempt
                            //  hot fixes during restart, or if we do not have the bad
                            //  cluster file yet.
                            //

                            if (!FlagOn( Vcb->VcbState, VCB_STATE_RESTART_IN_PROGRESS ) &&
                                (Vcb->BadClusterFileScb != NULL) &&
                                (!LowFileRecord ||
                                 ((ULONG)StartingVbo >= Vcb->Mft2Scb->Header.FileSize.LowPart))) {

                                NtfsPostHotFix( MasterIrp,
                                                &StartingVbo,
                                                StartingLbo,
                                                BytesPerCluster,
                                                FALSE );
                            }

                            //
                            //  Now see if we ended up with an error on this cluster, and handle
                            //  it accordingly.
                            //
                            //  If we are the one actually trying to fix this error,
                            //  then we need to get success so that we can make the page
                            //  valid with whatever good data we have and flush data
                            //  to its new location.
                            //
                            //  Currently we will not try to figure out if the error
                            //  is actually on the Scb (not to mention the sector) that
                            //  we are hot fixing, assuming that the best thing is to
                            //  just try to charge on.
                            //


                            if (FinalPass) {

                                //
                                //  Make sure he gets the error (if we still have an
                                //  error - see above).
                                //

                                if (!FT_SUCCESS(MasterIrp->IoStatus.Status)) {
                                    FinalStatus = MasterIrp->IoStatus.Status;
                                }
                            }
                        }
                    }

                    //
                    //  If this is a Usa-protected stream, we now perform end of
                    //  Usa processing.  (Otherwise do end of cluster processing
                    //  below.)
                    //

                    if (FlagOn(Scb->ScbState, SCB_STATE_USA_PRESENT)) {

                        ULONG NextOffset = IoRuns[RunNumber].BufferOffset + ByteOffset + Length;

                        //
                        //  If we are not at the end of a Usa block, there is no work
                        //  to do now.
                        //

                        if ((NextOffset & (UsaBlockSize - 1)) == 0) {

                            HotFixTrace(("May be verifying UsaBlock\n"));

                            //
                            //  If the Usa block is ok, we may be able to knock the
                            //  corresponding sectors out of the ClustersToRecover mask.
                            //

                            if ((IrpContext->MajorFunction != IRP_MJ_READ) ||
                                 NtfsVerifyAndRevertUsaBlock( IrpContext,
                                                              Scb,
                                                              MasterIrp,
                                                              NULL,
                                                              NextOffset - UsaBlockSize,
                                                              UsaBlockSize,
                                                              StartingVbo - (UsaBlockSize - Length) )) {

                                //
                                //  If we are only fixing a Usa error anyway, or this is
                                //  the final pass or at least not the first pass, then
                                //  we can remove these clusters from the recover mask.
                                //

                                if (FixingUsaError || FinalPass || (FtCase != 0)) {

                                    ULONG ShiftCount = UsaBlockSize >> Vcb->ClusterShift;

                                    ClustersToRecover -= (ClusterMask * 2) -
                                                         (ClusterMask >> (ShiftCount - 1));
                                }

                            //
                            //  Note, that even if we get a Usa error, we want to
                            //  update the byte count on the final pass, because
                            //  our reader expects that.
                            //

                            } else if (FinalPass) {

                                HotFixTrace(("Verify may have failed\n"));
                            }
                        }

                    //
                    //  Perform end of cluster processing if not a Usa-protected stream.
                    //

                    } else {

                        //
                        //  If the read succeeded and this is the final pass or at least
                        //  not the first pass, we can take this cluster out of the cluster
                        //  to recover mask.
                        //

                        if (FT_SUCCESS(MasterIrp->IoStatus.Status) && (FinalPass || (FtCase != 0))) {

                            ClustersToRecover -= ClusterMask;
                        }
                    }
                }
            }
        }

        //
        //  Assume we terminated the inner loops because we hit a 32 cluster boundary,
        //  and advance our alignment points.
        //

        AlignedRunNumber = RunNumber;

        //
        //  We should have updated ByteOffset above (Prefast initialization).
        //

        ASSERT( ByteOffset != MAXULONG );

        AlignedByteOffset = ByteOffset;

    } while (RunNumber < MultipleIrpCount);

    //
    //  Now put the final status in the MasterIrp and return
    //

    MasterIrp->IoStatus.Status = FinalStatus;
    if (!NT_SUCCESS(FinalStatus)) {
        MasterIrp->IoStatus.Information = 0;
    }

    HotFixTrace(("NtfsFixDataError returning IoStatus = %08lx, %08lx\n",
                 MasterIrp->IoStatus.Status,
                 MasterIrp->IoStatus.Information));

    return;
}


VOID
NtfsPostHotFix (
    IN PIRP Irp,
    IN PLONGLONG BadVbo,
    IN LONGLONG BadLbo,
    IN ULONG ByteLength,
    IN BOOLEAN DelayIrpCompletion
    )

/*++

Routine Description:

    This routine posts a hot fix request to a worker thread.  It has to be posted,
    because we cannot expect to be able to acquire the resources we need exclusive
    when the bad cluster is discovered.

    The description of the hot fix is handed to NtfsPerformHotFix through
    overloaded fields of a separate IrpContext allocated here:

        OriginatingIrp              - the file object (referenced here)
        ScbSnapshot.AllocationSize  - *BadVbo
        ScbSnapshot.FileSize        - BadLbo
        ScbSnapshot.ValidDataLength - ByteLength (read back as a ULONG)
        Irp to complete             - SharedScb on 64-bit platforms, otherwise
                                      the HighPart of ScbSnapshot.ValidDataLength

Arguments:

    Irp - The Irp for a read or write request which got the error

    BadVbo - The Vbo of the bad cluster for the read or write request

    BadLbo - The Lbo of the bad cluster

    ByteLength - Length to hot fix

    DelayIrpCompletion - TRUE if the Irp should not be completed until the hot
                         fix is done.

Return Value:

    None

--*/

{
    PIRP_CONTEXT HotFixIrpContext = NULL;
    PVOLUME_DEVICE_OBJECT VolumeDeviceObject;
    PIO_STACK_LOCATION IrpSp = IoGetCurrentIrpStackLocation(Irp);
    PFILE_OBJECT FileObject = IrpSp->FileObject;

    //
    //  Use %p for the file object so the whole pointer is printed on
    //  64-bit platforms as well.
    //

    HotFixTrace(("NTFS: Posting hotfix on file object: %p\n", FileObject));

    //
    //  Allocate an IrpContext to post the hot fix to a worker thread.
    //

    NtfsInitializeIrpContext( Irp, FALSE, &HotFixIrpContext );

    //
    //  First reference the file object so that it will not go away
    //  until the hot fix is done.  (We cannot increment the CloseCount
    //  in the Scb, since we are not properly synchronized.)
    //

    ObReferenceObject( FileObject );

    //
    //  Stash the hot fix parameters in the IrpContext.  Note that the
    //  OriginatingIrp field carries the file object, not an Irp, and that
    //  the ByteLength store below relies on the compiler accepting a cast
    //  on the left side of an assignment (non-standard C).
    //

    HotFixIrpContext->OriginatingIrp = (PIRP)FileObject;
    HotFixIrpContext->ScbSnapshot.AllocationSize = *BadVbo;
    HotFixIrpContext->ScbSnapshot.FileSize = BadLbo;
    ((ULONG)HotFixIrpContext->ScbSnapshot.ValidDataLength) = ByteLength;
    if (DelayIrpCompletion) {

#ifdef _WIN64

        //
        // (fcf) The IrpToComplete pointer is stashed into the high half of a
        // LONGLONG.  This is problematic on WIN64, so we have to store it
        // somewhere else on 64-bit platforms.  IrpContext->SharedScb is unused
        // in this codepath (asserted below), so we'll use that.
        //
        // It's possible that this change could be made for 32-bit platforms as
        // well, if only to avoid this conditional compilation, but I would
        // prefer the original authors to sanity-check this first.
        //
        // See also NtfsPerformHotFix() where this pointer is extracted.
        //

        ASSERT(HotFixIrpContext->SharedScbSize == 0);
        ASSERT(HotFixIrpContext->SharedScb == NULL);

        (PIRP)HotFixIrpContext->SharedScb = Irp;

#else // !_WIN64

        //
        //  On 32-bit platforms the Irp pointer fits in the high half of
        //  ValidDataLength.
        //

        ((PLARGE_INTEGER)&HotFixIrpContext->ScbSnapshot.ValidDataLength)->HighPart = (ULONG)Irp;

#endif // _WIN64

    } else {

        //
        //  No Irp to complete later, so clear the high half of
        //  ValidDataLength.  SharedScb is not touched on this path; the
        //  64-bit reader presumably relies on it still being NULL from
        //  initialization - TODO confirm.
        //

        ((PLARGE_INTEGER)&HotFixIrpContext->ScbSnapshot.ValidDataLength)->HighPart = 0;
    }

    //
    //  Locate the volume device object and Vcb that we are trying to access
    //

    VolumeDeviceObject = (PVOLUME_DEVICE_OBJECT)IrpSp->DeviceObject;
    HotFixIrpContext->Vcb = &VolumeDeviceObject->Vcb;

    //
    //  Send it off.....  The work item lives inside the IrpContext, and the
    //  IrpContext itself is the parameter passed to NtfsPerformHotFix.
    //

    RtlZeroMemory( &HotFixIrpContext->WorkQueueItem, sizeof( WORK_QUEUE_ITEM ) );
    ExInitializeWorkItem( &HotFixIrpContext->WorkQueueItem,
                          (PWORKER_THREAD_ROUTINE)NtfsPerformHotFix,
                          (PVOID)HotFixIrpContext );

    ExQueueWorkItem( &HotFixIrpContext->WorkQueueItem, CriticalWorkQueue );
}


VOID
NtfsPerformHotFix (
    IN PIRP_CONTEXT IrpContext
    )

/*++

Routine Description:

    This routine implements a hot fix that was scheduled
    above, extracting its parameters from the IrpContext initialized
    above.  The hot fix must be for a contiguous range of Lcns (usually 1).

Arguments:

    IrpContext - Supplies the IrpContext with the hot fix information

Return Value:

    None.

--*/

{
    TOP_LEVEL_CONTEXT TopLevelContext;
    PTOP_LEVEL_CONTEXT ThreadTopLevelContext;

    ATTRIBUTE_ENUMERATION_CONTEXT Context;
    TYPE_OF_OPEN TypeOfOpen;
    PVCB Vcb;
    PFCB Fcb;
    PSCB Scb;
    PCCB Ccb;
    PSCB BadClusterScb;
    VCN BadVcn;
    LCN LcnTemp, BadLcn;
    LONGLONG ClusterCount;
    NTSTATUS Status;
    PVOID Buffer;
    PIRP IrpToComplete;
    ULONG ClustersToFix;
    PBCB Bcb = NULL;
    ERESOURCE_THREAD BcbOwner = 0;
    BOOLEAN PerformFullCleanup = TRUE;
    NTSTATUS CompletionStatus = STATUS_SUCCESS;
    PSCB OriginalScb = NULL;
    PSCB NewScb = NULL;
    BOOLEAN PagingFile;

    //
    //  Extract a description of the cluster to be fixed.
    //

    PFILE_OBJECT FileObject = (PFILE_OBJECT)IrpContext->OriginatingIrp;
    VBO BadVbo = *(PVBO)&IrpContext->ScbSnapshot.AllocationSize;

    PAGED_CODE();

    //
    //  Reset the shared fields
    //

    InitializeListHead( &IrpContext->RecentlyDeallocatedQueue );
    InitializeListHead( &IrpContext->ExclusiveFcbList );

    ThreadTopLevelContext = NtfsInitializeTopLevelIrp( &TopLevelContext, TRUE, FALSE );
    ASSERT( ThreadTopLevelContext == &TopLevelContext );
    ASSERT( FlagOn( IrpContext->State, IRP_CONTEXT_STATE_ALLOC_FROM_POOL ));

    NtfsUpdateIrpContextWithTopLevel( IrpContext, ThreadTopLevelContext );

    //
    //  Initialize our local variables
    //

    TypeOfOpen = NtfsDecodeFileObject( IrpContext, FileObject, &Vcb, &Fcb, &Scb, &Ccb, FALSE );
    BadClusterScb = Vcb->BadClusterFileScb;
    BadVcn = LlClustersFromBytesTruncate( Vcb, BadVbo );
    BadLcn = LlClustersFromBytesTruncate( Vcb, IrpContext->ScbSnapshot.FileSize );
    ClustersToFix = ClustersFromBytes( Vcb, ((ULONG)IrpContext->ScbSnapshot.ValidDataLength) );

#ifdef _WIN64

    //
    // See comments in NtfsPostHotFix() regarding the location of IrpToComplete.
    //

    ASSERT(IrpContext->SharedScbSize == 0);
    IrpToComplete = (PIRP)IrpContext->SharedScb;

    //
    // Reset SharedScb back to NULL just to be safe.
    //

    IrpContext->SharedScb = NULL;

#else // !_WIN64

    IrpToComplete = (PIRP)(((PLARGE_INTEGER)&IrpContext->ScbSnapshot.ValidDataLength)->HighPart);

#endif

    //
    //  Remember the status to complete the original Irp with.
    //

    if (IrpToComplete != NULL) {

        CompletionStatus = IrpToComplete->IoStatus.Status;
    }

    NtfsInitializeAttributeContext( &Context );

    //
    //  Set up for synchronous operation
    //

    SetFlag( IrpContext->State, IRP_CONTEXT_STATE_WAIT );

    //
    //  Show that we are performing a HotFix.  Note we are not processing
    //  an Irp now.
    //

    IrpContext->OriginatingIrp = NULL;

    TopLevelContext.VboBeingHotFixed = BadVbo;
    TopLevelContext.ScbBeingHotFixed = Scb;

    //
    //  Acquire the Vcb before acquiring the paging Io resource.
    //

    NtfsAcquireExclusiveVcb( IrpContext, Vcb, TRUE );
    ASSERT( 1 == ExIsResourceAcquiredSharedLite( &Vcb->Resource ) );

    //
    //  While we're holding the Vcb, let's make sure the volume is still mounted.
    //  If it isn't mounted, we need to clean up and get out.
    //

    if (!FlagOn( Vcb->VcbState, VCB_STATE_VOLUME_MOUNTED )) {

        NtfsCleanupAttributeContext( IrpContext, &Context );
        NtfsReleaseVcb( IrpContext, Vcb );

        NtfsCompleteRequest( IrpContext, IrpToComplete, CompletionStatus );
        return;
    }

    //
    //  Acquire the paging io resource for this Fcb if it exists.
    //

    if (Scb->Header.PagingIoResource != NULL) {

        NtfsAcquireExclusivePagingIo( IrpContext, Fcb );
    }

    //
    //  Just because we are hot fixing one file, it is possible that someone
    //  will log to another file and try to lookup Lcns.  So we will acquire
    //  all files.  Example:  Hot fix is in Mft, and SetFileInfo has only the
    //  file acquired, and will log something to the Mft, and cause Lcns to be
    //  looked up.
    //

    NtfsAcquireAllFiles( IrpContext, Vcb, TRUE, FALSE, FALSE );

    //
    //  For the bitmap - acquire again to explicitly get it on the exclusive list
    //  and release the initial acquire
    //

    if (Scb == Vcb->BitmapScb) {

        ASSERT( NtfsIsExclusiveScb( Scb ) && (NtfsIsSharedScb( Scb ) == 1));

        NtfsAcquireExclusiveFcb( IrpContext, Scb->Fcb, Scb, ACQUIRE_HOLD_BITMAP );
        NtfsReleaseResource( IrpContext, Fcb );

        ASSERT( NtfsIsExclusiveScb( Scb ) && (NtfsIsSharedScb( Scb ) == 1) &&
                (Scb->Fcb->ExclusiveFcbLinks.Flink != NULL));
    }

    //
    //  Don't attempt to hotfix if the scb is deleted
    //

    if (!FlagOn( Scb->ScbState, SCB_STATE_ATTRIBUTE_DELETED )) {

        //
        //  Catch all exceptions.  Note, we should not get any I/O error exceptions
        //  on our device.
        //

        try {

            PagingFile = FlagOn( Fcb->FcbState, FCB_STATE_PAGING_FILE ) && FlagOn( Scb->ScbState, SCB_STATE_UNNAMED_DATA );

            //
            //  Hotfixing the paging file is tricky because paging file i/o acquires no resources
            //  So we create a shadow scb to do the work in
            //

            if (PagingFile) {

                UNICODE_STRING Mirror;
                BOOLEAN Existing;
                VCN Vcn;
                LCN Lcn;

#ifdef BENL_DBG
                KdPrint(( "NTFS: hotfixing pagefile\n "));
#endif

                Mirror.Length = Mirror.MaximumLength = 12;
                Mirror.Buffer = L"Mirror";

                NewScb = NtfsCreateScb( IrpContext, Scb->Fcb, $DATA, &Mirror, FALSE, &Existing );
                ASSERT( Existing == FALSE );
                ASSERT( FlagOn( NewScb->ScbState, SCB_STATE_NONPAGED ));

                //
                //  Null out the name so we think it points to real unnamed $data
                //

                NewScb->AttributeName.Length = 0;

                //
                //  Now update the mirror from the attribute to get the header info and
                //  snapshot it
                //

                NtfsUpdateScbFromAttribute( IrpContext, NewScb, NULL );
                NtfsSnapshotScb( IrpContext, NewScb );

                //
                //  Load the real scb's mcb cluster info into the mirror
                //

                for (Vcn = 0; Vcn < LlClustersFromBytes( Vcb, Scb->Header.AllocationSize.QuadPart ); Vcn += ClusterCount ) {

                    if (NtfsLookupNtfsMcbEntry( &Scb->Mcb, Vcn, &Lcn, &ClusterCount, NULL, NULL, NULL, NULL )) {

                        NtfsAddNtfsMcbEntry( &NewScb->Mcb, Vcn, Lcn, ClusterCount, FALSE );
                    } else {

                        ASSERTMSG( "Missing range in paging file.\n", FALSE );
                        break;
                    }
                }

                OriginalScb = Scb;
                Scb = NewScb;
            }

            for (; ClustersToFix != 0; ClustersToFix -= 1) {

                //
                //  Lookup the bad cluster to see if it is already in the bad cluster
                //  file, and do nothing if it is.
                //

                if (!NtfsLookupAllocation( IrpContext,
                                           BadClusterScb,
                                           BadLcn,
                                           &LcnTemp,
                                           &ClusterCount,
                                           NULL,
                                           NULL ) &&

                    NtfsLookupAllocation( IrpContext,
                                          Scb,
                                          BadVcn,
                                          &LcnTemp,
                                          &ClusterCount,
                                          NULL,
                                          NULL ) &&

                    (LcnTemp == BadLcn)) {

                    //
                    //  Pin the bad cluster in memory, so that we will not lose whatever data
                    //  we have for it.  (This data will be the correct data if we are talking
                    //  to the FT driver or got a write error, otherwise it may be all -1's.)
                    //
                    //  Do not try to do this if we are holding on to the original Irp, as that
                    //  will cause a collided page wait deadlock.
                    //

                    if (IrpToComplete == NULL) {

                        ULONG Count = 100;

                        NtfsCreateInternalAttributeStream( IrpContext,
                                                           Scb,
                                                           FALSE,
                                                           &NtfsInternalUseFile[PERFORMHOTFIX_FILE_NUMBER] );

                        //
                        //  We loop as long as we get a data error.  We want our
                        //  thread to read from the disk because we will recognize
                        //  an I/O request started in PerformHotFix and ignore the
                        //  data error.  The cases where we do get an error will
                        //  probably be from Mm intercepting this request because
                        //  of a collided read with another thread.
                        //


                        do {

                            Status = STATUS_SUCCESS;

                            try {

                                NtfsPinStream( IrpContext, Scb, BadVbo, Vcb->BytesPerCluster, &Bcb, &Buffer );

                            } except ((!FsRtlIsNtstatusExpected( Status = GetExceptionCode())
                                       || FsRtlIsTotalDeviceFailure( Status ))
                                      ? EXCEPTION_CONTINUE_SEARCH
                                      : EXCEPTION_EXECUTE_HANDLER) {

                                NOTHING;
                            }

                        } while (Count-- && (Status != STATUS_SUCCESS));

                        if (Status != STATUS_SUCCESS) {

                            NtfsRaiseStatus( IrpContext, Status, NULL, NULL );
                        }
                    }

                    //
                    //  If we're hotfixing the logfile, set the bcb owner to (thread | 0x1) so
                    //  we don't run into trouble if the logged changes to it use the same page.
                    //  Lfs will also set the bcb owner and our release will fail because the
                    //  thread owner has been changed.
                    //

                    if (Scb == Vcb->LogFileScb) {

                        BcbOwner = (ERESOURCE_THREAD) (((ULONG_PTR) PsGetCurrentThread()) | 1);

                        CcSetBcbOwnerPointer( Bcb, (PVOID)BcbOwner );
                    }

                    //
                    //  Now deallocate the bad cluster in this stream in the bitmap only,
                    //  since in general we do not support sparse deallocation in the file
                    //  record.  We will update the allocation below.
                    //

    #if DBG
                    KdPrint(("NTFS:     Freeing Bad Vcn: %08lx, %08lx\n", ((ULONG)BadVcn), ((PLARGE_INTEGER)&BadVcn)->HighPart));
    #endif

                    //
                    //   Deallocate clusters directly - so the change is only in memory.
                    //   Because we're not using the normal NtfsDeleteAllocation it's necessary to
                    //   manually create the snapshots that will correctly unload the modified range in
                    //   case of a raise.
                    //

                    NtfsSnapshotScb( IrpContext, Scb );

                    if (BadVcn < Scb->ScbSnapshot->LowestModifiedVcn) {

                        Scb->ScbSnapshot->LowestModifiedVcn = BadVcn;
                    }

                    if (BadVcn > Scb->ScbSnapshot->HighestModifiedVcn) {

                        Scb->ScbSnapshot->HighestModifiedVcn = BadVcn;
                    }

                    NtfsDeallocateClusters( IrpContext,
                                            Vcb,
                                            Scb,
                                            BadVcn,
                                            BadVcn,
                                            &Scb->TotalAllocated );
                    //
                    //  Look up the bad cluster attribute.
                    //

                    NtfsLookupAttributeForScb( IrpContext, BadClusterScb, NULL, &Context );

                    //
                    //  Now append this cluster to the bad cluster file
                    //

    #if DBG
                    KdPrint(("NTFS:     Retiring Bad Lcn: %08lx, %08lx\n", ((ULONG)BadLcn), ((PLARGE_INTEGER)&BadLcn)->HighPart));
    #endif

                    NtfsAddBadCluster( IrpContext, Vcb, BadLcn );

                    //
                    //  Now update the file record for the bad cluster file to
                    //  show the new cluster.
                    //

                    NtfsAddAttributeAllocation( IrpContext,
                                                BadClusterScb,
                                                &Context,
                                                &BadLcn,
                                                (PVCN)&Li1 );

                    //
                    //  Now reallocate a cluster to the original stream to replace the bad cluster.
                    //

                    HotFixTrace(("NTFS:     Reallocating Bad Vcn\n"));
                    NtfsAddAllocation( IrpContext, NULL, Scb, BadVcn, (LONGLONG)1, FALSE, NULL );

                    //
                    //  Unpin the pages now so that the flush won't block if we are hot-fixing the Mft.
                    //

                    NtfsCleanupAttributeContext( IrpContext, &Context );

                    //
                    //  Now that there is a new home for the data, mark the page dirty, unpin
                    //  it and flush it out to its new home.
                    //

                    if (IrpToComplete == NULL) {

                        LONGLONG BiasedBadVbo = BadVbo;

                        CcSetDirtyPinnedData( Bcb, NULL );

                        if (Scb != Vcb->LogFileScb) {
                            NtfsUnpinBcb( IrpContext, &Bcb );
                        } else {
                            NtfsUnpinBcbForThread( IrpContext, &Bcb, BcbOwner );
                        }

                        //
                        //  Flush the stream.  Ignore the status - if we get something like
                        //  a log file full, the Lazy Writer will eventually write the page.
                        //  Bias the write if this is the Usn Journal.
                        //

                        if (FlagOn( Scb->ScbPersist, SCB_PERSIST_USN_JOURNAL )) {

                            BiasedBadVbo -= Scb->Vcb->UsnCacheBias;
                        }

#ifdef _WIN64
                        //
                        //  Currently, we cannot hotfix the $logfile on ia64 as the
                        //  flush below will cause an AV due to NtfsCheckWriteRange
                        //  not being capable of handling a call all the way from this
                        //  routine, as the last flush file offset can be very different
                        //  from the bad vcn file offset.  Instead we let someone else
                        //  do the flush.  The $logfile data will be bad and the drive may
                        //  get marked dirty but we will recover on the next round
                        //  as the bad cluster would have been replaced.
                        //

                        if (Scb != Vcb->LogFileScb) {
#endif

                            (VOID)NtfsFlushUserStream( IrpContext, Scb, &BiasedBadVbo, 1 );

#ifdef _WIN64
                        }
#endif

                    }

                    //
                    //  Commit the transaction.
                    //

                    NtfsCommitCurrentTransaction( IrpContext );

                    //
                    //  Now that the data is flushed to its new location, we will write the
                    //  hot fix record.  We don't write the log record if we are
                    //  fixing the logfile.  Instead we explicitly flush the Mft record
                    //  for the log file.  The log file is one file where we expect
                    //  to be able to read the mapping pairs on restart.
                    //

                    if (Scb == Vcb->LogFileScb) {

                        if (Vcb->MftScb->FileObject != NULL) {

                            CcFlushCache( &Vcb->MftScb->NonpagedScb->SegmentObject,
                                          &Li0,
                                          Vcb->BytesPerFileRecordSegment * ATTRIBUTE_DEF_TABLE_NUMBER,
                                          NULL );
                        }

                    } else {

                        (VOID) NtfsWriteLog( IrpContext,
                                             Scb,
                                             NULL,
                                             HotFix,
                                             NULL,
                                             0,
                                             Noop,
                                             NULL,
                                             0,
                                             LlBytesFromClusters( Vcb, BadVcn ),
                                             0,
                                             0,
                                             Vcb->BytesPerCluster );

                        //
                        //  And we have to commit that one, too.
                        //

                        NtfsCommitCurrentTransaction( IrpContext );
                    }

                    //
                    //  Now flush the log to insure that the hot fix gets remembered,
                    //  especially important if this is the paging file.
                    //

                    LfsFlushToLsn( Vcb->LogHandle, LiMax );

                    HotFixTrace(("NTFS:     Bad Cluster replaced\n"));
                }

                //
                //  Get ready for another possible pass through the loop
                //

                BadVcn = BadVcn + 1;
                BadLcn = BadLcn + 1;

                ASSERT( NULL == Bcb );
            }

            //
            //  Move the in memory allocation from the mirror of the paging file
            //  back to the real scb in an atomic manner
            //

            if (NewScb != NULL) {

                NtfsSwapMcbs( &NewScb->Mcb, &OriginalScb->Mcb );
                NtfsDeleteScb( IrpContext, &NewScb );
                Scb = OriginalScb;
            }

        } except(NtfsExceptionFilter( IrpContext, GetExceptionInformation() )) {

            NTSTATUS ExceptionCode = GetExceptionCode();

            //
            //  We are not prepared to have our IrpContext requeued, so just
            //  consider these cases to be bad luck.  We will put a status of
            //  data error in the irp context and pass that code to the process
            //  exception routine.
            //

            if ((ExceptionCode == STATUS_LOG_FILE_FULL) ||
                (ExceptionCode == STATUS_CANT_WAIT)) {

                ExceptionCode = IrpContext->ExceptionStatus = STATUS_DATA_ERROR;
            }

            //
            //  We won't be calling ReleaseAllFiles.  Decrement the Acquire count
            //  before releasing the Fcbs.
            //

            ASSERT( Vcb->AcquireFilesCount != 0 );
            Vcb->AcquireFilesCount -= 1;

            //
            //  Cleanup the temporary mirror scb (if there is one) while we have an
            //  irpcontext
            //

            if (NewScb != NULL) {
                NtfsDeleteScb( IrpContext, &NewScb );
                Scb = OriginalScb;
            }

            NtfsProcessException( IrpContext, NULL, ExceptionCode );

            //
            //  The IrpContext is really gone now.
            //

            IrpContext = NULL;
            PerformFullCleanup = FALSE;

            ASSERT( IoGetTopLevelIrp() != (PIRP) &TopLevelContext );
        }
    }

    //
    //  Let any errors be handled in the except clause above, however we
    //  cleanup on the way out, because for example we need the IrpContext
    //  still in the except clause.
    //

    try {

        NtfsCleanupAttributeContext( IrpContext, &Context );

        if (Scb != Vcb->LogFileScb) {
            NtfsUnpinBcb( IrpContext, &Bcb );
        } else {
            NtfsUnpinBcbForThread( IrpContext, &Bcb, BcbOwner );
        }

        //
        //  If we aborted this operation then all of the file resources have
        //  already been released.
        //

        if (PerformFullCleanup) {

            NtfsReleaseAllFiles( IrpContext, Vcb, FALSE );

            NtfsReleaseVcb( IrpContext, Vcb );

        //
        //  The files have been released but not the Vcb or the volume bitmap.
        //

        } else {

            if ((Vcb->BitmapScb != NULL) &&  NtfsIsExclusiveScb( Vcb->BitmapScb )) {

                NtfsReleaseResource( IrpContext, Vcb->BitmapScb );
            }

            //
            //  We need to release the Vcb twice since we specifically acquire
            //  it once and then again with all the files.
            //

            NtfsReleaseVcb( IrpContext, Vcb );
            NtfsReleaseVcb( IrpContext, Vcb );
        }

        ObDereferenceObject( FileObject );

        //
        //  The IrpContext and Irp will already be NULL if they have been completed already.
        //

        NtfsCompleteRequest( IrpContext, IrpToComplete, CompletionStatus );

    } except(EXCEPTION_EXECUTE_HANDLER) {
        NOTHING;
    }

    ASSERT( IoGetTopLevelIrp() != (PIRP) &TopLevelContext );
}


BOOLEAN
NtfsGetReservedBuffer (
    IN PFCB ThisFcb,
    OUT PVOID *Buffer,
    OUT PULONG Length,
    IN UCHAR Need2
    )

/*++

Routine Description:

    This routine tries to hand one of the three reserved buffers to the
    caller.  The whole decision is made while holding
    NtfsReservedBufferMutex.  A caller who might need two buffers is served
    out of buffers 1 and 2 (with buffer 3 as the fallback when buffer 1 is
    already taken).  A caller who needs a single buffer can be given any of
    the three.

    The routine first decides which buffer (if any) can be granted and then
    performs the bookkeeping for that buffer in one place.

Arguments:

    ThisFcb - This is the Fcb where the io is occurring.

    Buffer - Address to store the address of the granted buffer.  Not
        written if no buffer is granted.

    Length - Address to store the length of the granted buffer.  Not
        written if no buffer is granted.

    Need2 - Zero if only one buffer is needed.  RESERVED_BUFFER_TWO_NEEDED
        asks for buffer 1 (or buffer 3 if buffer 1 is already in use).
        Any other non-zero value is expected to be
        RESERVED_BUFFER_WORKSPACE_NEEDED and asks for buffer 2, which can
        be granted repeatedly and is counted in NtfsReserved2Count.

Return Value:

    BOOLEAN - TRUE if a buffer was granted, FALSE otherwise.

--*/

{
    PVOID ThisThread = (PVOID) PsGetCurrentThread();
    BOOLEAN Granted = FALSE;
    BOOLEAN WorkspaceRequest = FALSE;
    BOOLEAN OwnsPair;
    UCHAR Choice = 0;

    ExAcquireFastMutexUnsafe( &NtfsReservedBufferMutex );

    //
    //  Remember whether this thread is already the owner of the buffer 1/2
    //  pair.  Nothing below changes the answer before it is consulted.
    //

    OwnsPair = (BOOLEAN) (NtfsReservedBufferThread == ThisThread);

    if (Need2) {

        //
        //  A two-buffer caller may proceed if he is already the owner of the
        //  pair, or if both buffers of the pair are free and buffer 3 is not
        //  held for this same file by some other thread.
        //

        if (OwnsPair ||
            (!FlagOn( NtfsReservedInUse, 3 ) &&
             ((NtfsReserved3Fcb != ThisFcb) ||
              (NtfsReserved3Thread == ThisThread)))) {

            //
            //  Record the ownership of the pair.  This happens even if no
            //  buffer turns out to be available below.
            //

            NtfsReservedBufferThread = ThisThread;
            NtfsReserved12Fcb = ThisFcb;

            if (Need2 != RESERVED_BUFFER_TWO_NEEDED) {

                //
                //  Workspace request.  Buffer 2 is always granted here.
                //

                ASSERT( Need2 == RESERVED_BUFFER_WORKSPACE_NEEDED );

                WorkspaceRequest = TRUE;
                Choice = 2;

            } else if (!FlagOn( NtfsReservedInUse, 1 )) {

                Choice = 1;

            } else if (!FlagOn( NtfsReservedInUse, 4 )) {

                //
                //  Buffer 1 is already in use so fall back to buffer 3.
                //

                Choice = 3;
            }
        }

    } else if (OwnsPair) {

        //
        //  Single buffer for the owner of the pair.  Buffer 2 is safe to use
        //  since its contents need not survive a recursive call.
        //

        Choice = 2;

    } else if (!FlagOn( NtfsReservedInUse, 4 )) {

        //
        //  Buffer 3 is free.  Refuse it if the pair is currently held on
        //  behalf of this same file.
        //

        if (ThisFcb != NtfsReserved12Fcb) {

            Choice = 3;
        }

    } else if (NtfsReservedBufferThread == NULL) {

        //
        //  Nobody owns the pair, so take buffer 2 or else buffer 1.  Doing so
        //  locks out a caller who needs two buffers.
        //

        if (!FlagOn( NtfsReservedInUse, 2 )) {

            Choice = 2;

        } else if (!FlagOn( NtfsReservedInUse, 1 )) {

            Choice = 1;
        }
    }

    //
    //  Now do the bookkeeping for whichever buffer was selected.
    //

    switch (Choice) {

    case 1:

        NtfsReserved1Thread = ThisThread;
        SetFlag( NtfsReservedInUse, 1 );
        *Buffer = NtfsReserved1;
        *Length = LARGE_BUFFER_SIZE;
        Granted = TRUE;
        break;

    case 2:

        NtfsReserved2Thread = ThisThread;
        SetFlag( NtfsReservedInUse, 2 );
        *Buffer = NtfsReserved2;

        //
        //  Only the workspace request reports the workspace length.
        //

        if (WorkspaceRequest) {

            *Length = WORKSPACE_BUFFER_SIZE;

        } else {

            *Length = LARGE_BUFFER_SIZE;
        }

        NtfsReserved2Count += 1;
        Granted = TRUE;
        break;

    case 3:

        NtfsReserved3Fcb = ThisFcb;
        NtfsReserved3Thread = ThisThread;
        SetFlag( NtfsReservedInUse, 4 );
        *Buffer = NtfsReserved3;
        *Length = LARGE_BUFFER_SIZE;
        Granted = TRUE;
        break;

    default:

        break;
    }

    ExReleaseFastMutexUnsafe( &NtfsReservedBufferMutex );

    return Granted;
}

BOOLEAN
NtfsFreeReservedBuffer (
    IN PVOID Buffer
    )

/*++

Routine Description:

    This routine gives back one of the reserved buffers.  The in-use bit
    and owning thread for the buffer are cleared while holding
    NtfsReservedBufferMutex.  Buffer 2 is counted, so it is only marked
    free when NtfsReserved2Count drops to zero.  Once neither buffer 1 nor
    buffer 2 is in use the owner of the pair and its Fcb are cleared too.

Arguments:

    Buffer - Address of the buffer being returned.

Return Value:

    BOOLEAN - TRUE if Buffer is one of the reserved buffers, FALSE if it
        is some other buffer (in which case nothing is changed).

--*/

{
    BOOLEAN Released = FALSE;
    BOOLEAN PairSlotVacated = FALSE;

    ExAcquireFastMutexUnsafe( &NtfsReservedBufferMutex );

    if (Buffer == NtfsReserved1) {

        ASSERT( FlagOn( NtfsReservedInUse, 1 ));

        ClearFlag( NtfsReservedInUse, 1 );
        NtfsReserved1Thread = NULL;

        PairSlotVacated = TRUE;
        Released = TRUE;

    } else if (Buffer == NtfsReserved2) {

        ASSERT( FlagOn( NtfsReservedInUse, 2 ));

        //
        //  Buffer 2 may have been granted several times.  Only the last
        //  release actually frees it.
        //

        NtfsReserved2Count -= 1;

        if (NtfsReserved2Count == 0) {

            ClearFlag( NtfsReservedInUse, 2 );
            NtfsReserved2Thread = NULL;

            PairSlotVacated = TRUE;
        }

        Released = TRUE;

    } else if (Buffer == NtfsReserved3) {

        ASSERT( FlagOn( NtfsReservedInUse, 4 ));

        ClearFlag( NtfsReservedInUse, 4 );
        NtfsReserved3Thread = NULL;
        NtfsReserved3Fcb = NULL;

        Released = TRUE;
    }

    //
    //  If we just freed buffer 1 or buffer 2 and the other one of the pair
    //  is not in use either, then there is no longer an owner of the pair.
    //

    if (PairSlotVacated && !FlagOn( NtfsReservedInUse, 3 )) {

        NtfsReservedBufferThread = NULL;
        NtfsReserved12Fcb = NULL;
    }

    ExReleaseFastMutexUnsafe( &NtfsReservedBufferMutex );

    return Released;
}


NTSTATUS
NtfsDefragFile (
    IN PIRP_CONTEXT IrpContext,
    IN PIRP Irp
    )

/*++

Routine Description:

    Direct defrag.  This routine modifies the input buffer to track progress. So the
    FSCTL must always be buffered.


Arguments:

    Irp - Supplies the Irp being processed.

Return Value:

    NTSTATUS - The return status for the operation.

--*/

{
    NTSTATUS Status;
    PIO_STACK_LOCATION IrpSp;
    PIO_STACK_LOCATION NextIrpSp;
    ULONG FsControlCode;

    PFILE_OBJECT FileObject;
    TYPE_OF_OPEN TypeOfOpen;
    PVCB Vcb;
    PFCB Fcb;
    PSCB Scb;
    PCCB Ccb;

    ATTRIBUTE_ENUMERATION_CONTEXT AttrContext;

#if defined( _WIN64 )
    MOVE_FILE_DATA MoveDataLocal;
#endif

    PMOVE_FILE_DATA MoveData;

    LONGLONG FileOffset;
    PMDL Mdl = NULL;
    BOOLEAN AcquiredScb = FALSE;
    BOOLEAN AcquiredAllFiles = FALSE;
    BOOLEAN AcquiredVcb = FALSE;
    ULONG DeletePendingFailureCountsLeft;

    extern POBJECT_TYPE *IoFileObjectType;

    PVOID Buffer = NULL;
    ULONG BufferLength;
    NTFS_IO_CONTEXT NtfsIoContext;
    BOOLEAN AcquiredBitmap = FALSE;
    BOOLEAN AcquiredMft = FALSE;
    BOOLEAN FreeRecentlyDeallocated = FALSE;
    BOOLEAN IoctlSupported = TRUE;

    PAGED_CODE( );

    //
    //  Always make this synchronous for MoveFile
    //  We should never be in the FSP for this.  Otherwise the user handle
    //  is invalid.  Also disable quota accounting since defrag doesn't affect it
    //  Otherwise we might trigger it while moving attributes around due to mapping pair
    //  changes and deadlock
    //

    SetFlag( IrpContext->State, IRP_CONTEXT_STATE_WAIT | IRP_CONTEXT_STATE_QUOTA_DISABLE );
    ASSERT( !FlagOn( IrpContext->State, IRP_CONTEXT_STATE_IN_FSP ));

    //
    //  Get the current Irp stack location and save some references.
    //

    IrpSp = IoGetCurrentIrpStackLocation( Irp );
    NextIrpSp = IoGetNextIrpStackLocation( Irp );
    FsControlCode = IrpSp->Parameters.FileSystemControl.FsControlCode;

    DebugTrace( +1, Dbg, ("NtfsMoveFile, FsControlCode = %08lx\n", FsControlCode) );

    //
    //  Extract and decode the file object and check for type of open.
    //

    TypeOfOpen = NtfsDecodeFileObject( IrpContext, IrpSp->FileObject, &Vcb, &Fcb, &Scb, &Ccb, TRUE );

    if ((Ccb == NULL) || !FlagOn( Ccb->AccessFlags, MANAGE_VOLUME_ACCESS )) {

        NtfsCompleteRequest( IrpContext, Irp, STATUS_ACCESS_DENIED );
        return STATUS_ACCESS_DENIED;
    }

#if defined(_WIN64)

    //
    //  Win32/64 thunking code
    //

    if (IoIs32bitProcess( Irp )) {

        PMOVE_FILE_DATA32 MoveData32;

        if (IrpSp->Parameters.FileSystemControl.InputBufferLength < sizeof( MOVE_FILE_DATA32 )) {

            NtfsCompleteRequest( IrpContext, Irp, STATUS_BUFFER_TOO_SMALL );
            return STATUS_BUFFER_TOO_SMALL;
        }

        MoveData32 = (PMOVE_FILE_DATA32) Irp->AssociatedIrp.SystemBuffer;
        MoveDataLocal.ClusterCount = MoveData32->ClusterCount;
        MoveDataLocal.FileHandle = (HANDLE)(ULONG_PTR)(LONG) MoveData32->FileHandle;
        MoveDataLocal.StartingLcn.QuadPart = MoveData32->StartingLcn.QuadPart;
        MoveDataLocal.StartingVcn.QuadPart = MoveData32->StartingVcn.QuadPart;
        MoveData = &MoveDataLocal;

    } else {
#endif

    //
    //  Get the input buffer pointer and check its length.
    //

    if (IrpSp->Parameters.FileSystemControl.InputBufferLength < sizeof( MOVE_FILE_DATA )) {

        NtfsCompleteRequest( IrpContext, Irp, STATUS_BUFFER_TOO_SMALL );
        return STATUS_BUFFER_TOO_SMALL;
    }

    MoveData = Irp->AssociatedIrp.SystemBuffer;

#if defined(_WIN64)
    }
#endif

    //
    //  Try to get a pointer to the file object from the handle passed in.
    //  Remember that we need to dereference this at some point but don't
    //  do it right away in case someone gets in before we acquire it.
    //
    //
    //  NOTE: if the rdr ever allows this to be done remotely we'll have to
    //  change our verification since Irp->RequestorMode would be kernel but we'd
    //  still need to verify the handle
    //

    Status = ObReferenceObjectByHandle( MoveData->FileHandle,
                                        0,
                                        *IoFileObjectType,
                                        Irp->RequestorMode,
                                        &FileObject,
                                        NULL );

    if (!NT_SUCCESS(Status)) {

        NtfsCompleteRequest( IrpContext, Irp, Status );
        return Status;
    }

    //
    //  Check that this file object is opened on the same volume as the
    //  DASD handle used to call this routine.
    //

    if (FileObject->Vpb != Vcb->Vpb) {

        ObDereferenceObject( FileObject );

        NtfsCompleteRequest( IrpContext, Irp, STATUS_INVALID_PARAMETER );
        return STATUS_INVALID_PARAMETER;
    }

    //
    //  Now decode this FileObject. We don't care to raise on dismounts here
    //  because we check for that further down anyway. Hence, RaiseOnError=FALSE.
    //

    TypeOfOpen = NtfsDecodeFileObject( IrpContext, FileObject, &Vcb, &Fcb, &Scb, &Ccb, FALSE );

    //
    //  Limit the files we will allow defragging to.  We can't defrag a file which needs
    //  its own mapping to write log records (volume bitmap).  We also eliminate the
    //  log file and usn journal. For the MFT we disallow moving the first 16 non-user files
    //

    if (((TypeOfOpen != UserFileOpen) &&
         (TypeOfOpen != UserDirectoryOpen) &&
         (TypeOfOpen != UserViewIndexOpen)) ||
        FlagOn( Fcb->FcbState, FCB_STATE_PAGING_FILE ) ||
        ((NtfsSegmentNumber( &Fcb->FileReference ) < ATTRIBUTE_DEF_TABLE_NUMBER)  &&
         ((NtfsSegmentNumber( &Fcb->FileReference ) != MASTER_FILE_TABLE_NUMBER) ||
          (MoveData->StartingVcn.QuadPart < LlClustersFromBytes( Vcb, FIRST_USER_FILE_NUMBER * Vcb->BytesPerFileRecordSegment )))) ||
        FlagOn( Fcb->FcbState, FCB_STATE_USN_JOURNAL ) ||
        NtfsEqualMftRef( &Fcb->FileReference, &BitmapFileReference )) {

        ObDereferenceObject( FileObject );
        NtfsCompleteRequest( IrpContext, Irp, STATUS_INVALID_PARAMETER );

        return STATUS_INVALID_PARAMETER;
    }

    //
    //  Disallow defragging on a read-only volume
    //

    if (NtfsIsVolumeReadOnly( Vcb )) {

        NtfsCompleteRequest( IrpContext, Irp, STATUS_MEDIA_WRITE_PROTECTED );
        return STATUS_MEDIA_WRITE_PROTECTED;
    }

    //
    //  Verify that the start Vcn, Lcn and cluster count are valid values.
    //

    if ((MoveData->StartingVcn.QuadPart < 0) ||
        (MoveData->StartingVcn.QuadPart + MoveData->ClusterCount < MoveData->ClusterCount) ||
        (Vcb->MaxClusterCount < MoveData->StartingVcn.QuadPart + MoveData->ClusterCount) ||
        (MoveData->StartingLcn.QuadPart < 0) ||
        (MoveData->StartingLcn.QuadPart >= Vcb->TotalClusters)) {

        ObDereferenceObject( FileObject );
        NtfsCompleteRequest( IrpContext, Irp, STATUS_INVALID_PARAMETER );
        return STATUS_INVALID_PARAMETER;

    }

    NtfsInitializeAttributeContext( &AttrContext );

    try {

        //
        //  For system files we need the vcb to test for dismounted volumes
        //

        if (FlagOn( Scb->Fcb->FcbState, FCB_STATE_SYSTEM_FILE )) {
            NtfsAcquireExclusiveVcb( IrpContext, Vcb, TRUE );
            AcquiredVcb = TRUE;

            if (!FlagOn(Vcb->VcbState, VCB_STATE_VOLUME_MOUNTED )) {
                try_return( Status = STATUS_VOLUME_DISMOUNTED );
            }
        }

        //
        //  We now want to acquire the Scb to check if we can continue.  It is
        //  important to test whether this Scb has a paging io resource, not
        //  whether the Fcb has one.  Consider the case where a directory has
        //  a named data stream in it -- the Fcb will have a paging io resource,
        //  but the index root Scb will not.  In that case it would be a mistake
        //  to acquire the Fcb's paging io resource, since that will not serialize
        //  this operation with NtfsAcquireFileForCcFlush.
        //

        SetFlag( IrpContext->Flags, IRP_CONTEXT_FLAG_ACQUIRE_PAGING );
        NtfsAcquireFcbWithPaging( IrpContext, Scb->Fcb, 0 );
        AcquiredScb = TRUE;

        if (FlagOn( Scb->ScbState, SCB_STATE_VOLUME_DISMOUNTED )) {

            try_return( Status = STATUS_VOLUME_DISMOUNTED );
        }

        //
        //  Check for the deny defrag being set
        //

        if (FlagOn( Scb->ScbPersist, SCB_PERSIST_DENY_DEFRAG ) && !FlagOn( Ccb->Flags, CCB_FLAG_DENY_DEFRAG )) {

            try_return( Status = STATUS_ACCESS_DENIED );
        }

        //
        //  Initialize the header if necc. If the attribute doesn't exist
        //  just leave - for instance an index allocation buffer
        //

        if (!NtfsLookupAttributeByName( IrpContext,
                                       Fcb,
                                       &Fcb->FileReference,
                                       Scb->AttributeTypeCode,
                                       &Scb->AttributeName,
                                       0,
                                       FALSE,
                                       &AttrContext )) {

            try_return( Status = STATUS_SUCCESS );
        }

        if (!FlagOn( Scb->ScbState, SCB_STATE_HEADER_INITIALIZED )) {
            NtfsUpdateScbFromAttribute( IrpContext, Scb, NtfsFoundAttribute( &AttrContext ) );
        }

        if ((TypeOfOpen == UserDirectoryOpen) || (TypeOfOpen == UserViewIndexOpen)) {

            //
            //  Initialize the Index information in the Scb if not done yet for indices.
            //

            if (Scb->ScbType.Index.BytesPerIndexBuffer == 0) {

                NtfsCleanupAttributeContext( IrpContext, &AttrContext );
                NtfsInitializeAttributeContext( &AttrContext );

                if (!NtfsLookupAttributeByName( IrpContext,
                                                Fcb,
                                                &Fcb->FileReference,
                                                $INDEX_ROOT,
                                                &Scb->AttributeName,
                                                0,
                                                FALSE,
                                                &AttrContext )) {

                    ASSERTMSG("Could not find Index Root for Scb\n", FALSE);
                    NtfsRaiseStatus( IrpContext, STATUS_FILE_CORRUPT_ERROR, NULL, Scb->Fcb );
                }

                NtfsUpdateIndexScbFromAttribute( IrpContext, Scb, NtfsFoundAttribute(&AttrContext), FALSE );
            }

            //
            //  Mark the irpcontext so we don't recursively push the index root while defragging
            //  the index. If we hit this on retry the force push flag will be set and we can safely
            //  pre-push the index
            //

            if (FlagOn( IrpContext->State, IRP_CONTEXT_STATE_FORCE_PUSH )) {
                NtfsPushIndexRoot( IrpContext, Scb );
            }
            SetFlag( IrpContext->Flags, IRP_CONTEXT_FLAG_DEFERRED_PUSH );
        }

        //
        //  Cleanup the attribute context now to remove bcbs
        //

        NtfsCleanupAttributeContext( IrpContext, &AttrContext );

        //
        //  If the stream is resident then we can return SUCCESS immediately.
        //  If the starting point is beyond file allocation then we can also
        //  return immediately.
        //

        FileOffset = (LONGLONG) LlBytesFromClusters( Vcb, MoveData->StartingVcn.QuadPart );
        ASSERT( FileOffset >= 0 );

        if (FlagOn( Scb->ScbState, SCB_STATE_ATTRIBUTE_RESIDENT ) ||
            (Scb->Header.AllocationSize.QuadPart < FileOffset)) {

            try_return( Status = STATUS_SUCCESS );
        }

        //
        //  Setup the intermediate buffer
        //

        ASSERT( LARGE_BUFFER_SIZE >= Vcb->BytesPerCluster );

        if (LARGE_BUFFER_SIZE > Vcb->BytesPerCluster) {
            BufferLength = LARGE_BUFFER_SIZE;
        } else {
            BufferLength = Vcb->BytesPerCluster;
        }

        IrpContext->Union.NtfsIoContext = &NtfsIoContext;
        RtlZeroMemory( IrpContext->Union.NtfsIoContext, sizeof( NTFS_IO_CONTEXT ));
        KeInitializeEvent( &IrpContext->Union.NtfsIoContext->Wait.SyncEvent,
                           NotificationEvent,
                           FALSE );

        DeletePendingFailureCountsLeft = 10;

        NtfsReleaseFcbWithPaging( IrpContext, Scb->Fcb );
        AcquiredScb = FALSE;

        if (AcquiredVcb) {
            NtfsReleaseVcb( IrpContext, Vcb );
            AcquiredVcb = FALSE;
        }

        if (IrpContext->TransactionId != 0) {

            ASSERT( !AcquiredAllFiles );

            //
            //  Complete the request which commits the pending
            //  transaction if there is one and releases of the
            //  acquired resources.  The IrpContext will not
            //  be deleted because the no delete flag is set.
            //

            SetFlag( IrpContext->Flags, IRP_CONTEXT_FLAG_DONT_DELETE | IRP_CONTEXT_FLAG_RETAIN_FLAGS );
            NtfsCompleteRequest( IrpContext, NULL, STATUS_SUCCESS );
        }

        //
        //  Main loop - while there are more clusters requested to move try to move them
        //

        while (MoveData->ClusterCount > 0) {

            LCN Lcn;
            LONGLONG ClusterCount;
            LONGLONG TransferSize;
            LONGLONG TransferClusters;

            try {

                try {

                    //
                    //  If necessary, grab all the files to synchronize with any transactions,
                    //  flush the log and try to free recently deallocated clusters
                    //

                    if (FreeRecentlyDeallocated) {

                        FreeRecentlyDeallocated = FALSE;

                        try {

                            NtfsPurgeFileRecordCache( IrpContext );

                            //
                            //  Acquire all files to flush the log file and free recently deallocated.
                            //  Note the flush may raise, normally log file full, which will get rid of
                            //  the recently deallocated in a less efficient manner.
                            //

                            NtfsAcquireAllFiles( IrpContext, IrpContext->Vcb, FALSE, FALSE, FALSE );
                            AcquiredAllFiles = TRUE;

                            //
                            //  Since we've dropped and reacquired all the files, we must retest
                            //  whether the volume has been dismounted. Use the vcb since acquireallfiles
                            //  grabs it
                            //

                            if (!FlagOn( IrpContext->Vcb->VcbState, VCB_STATE_VOLUME_MOUNTED )) {

                                //
                                //  Raise we don't try to acquire the Scb exclusive in the try-finally
                                //  below.  We only hold this resource shared from the AcquireAllFiles
                                //  above.  It is OK to clear the REALLOCATE_ON_WRITE bit somewhat
                                //  unsynchronized since we will never touch this file again.
                                //

                                NtfsRaiseStatus( IrpContext, STATUS_VOLUME_DISMOUNTED, NULL, NULL );
                            }

                            LfsFlushToLsn( IrpContext->Vcb->LogHandle, LiMax );
                            NtfsFreeRecentlyDeallocated( IrpContext, IrpContext->Vcb, &LiMax, TRUE );


                        } finally {

                            if (AcquiredAllFiles) {

                                NtfsReleaseAllFiles( IrpContext, IrpContext->Vcb, FALSE );
                                AcquiredAllFiles = FALSE;
                            }
                        }
                    }

                    //
                    //  Purge anything left in cache because we hold nothing at this point
                    //

                    NtfsPurgeFileRecordCache( IrpContext );


                    //
                    //  For system files we need the vcb to test for dismounted volumes
                    //

                    if (FlagOn( Scb->Fcb->FcbState, FCB_STATE_SYSTEM_FILE )) {
                        NtfsAcquireExclusiveVcb( IrpContext, Vcb, TRUE );
                        AcquiredVcb = TRUE;

                        if (!FlagOn(Vcb->VcbState, VCB_STATE_VOLUME_MOUNTED )) {
                            try_return( Status = STATUS_VOLUME_DISMOUNTED );
                        }
                    }

                    //
                    //  Reacquire everything for the defrag mft case + the mft flush
                    //  resource so we know lazy writes aren't active while we're doing stuff
                    //

                    if (NtfsSegmentNumber( &Fcb->FileReference ) == MASTER_FILE_TABLE_NUMBER) {

                        NtfsAcquireAllFiles( IrpContext, Vcb, TRUE, FALSE, FALSE );
                        AcquiredAllFiles = TRUE;

                        ExAcquireResourceExclusiveLite( &Vcb->MftFlushResource, TRUE );

                    } else {

                        NtfsAcquireFcbWithPaging( IrpContext, Scb->Fcb, 0 );
                        AcquiredScb = TRUE;

                        //
                        //  Since we've dropped and reacquired the Scb, we must retest
                        //  whether the volume has been dismounted.
                        //

                        if (FlagOn( Scb->ScbState, SCB_STATE_VOLUME_DISMOUNTED )) {

                            try_return( Status = STATUS_VOLUME_DISMOUNTED );
                        }
                    }

                    //
                    //  If we acquired all the files above now do the work to check for free space in the mft
                    //

                    if (AcquiredAllFiles && (Vcb->MftScb->ScbType.Mft.RecordAllocationContext.NumberOfFreeBits <= 1)) {

                        MFT_SEGMENT_REFERENCE FileNumber;

#ifdef BENL_DBG
                        KdPrint(( "NTFS: too few mft records: 0x%x\n", Vcb->MftScb->ScbType.Mft.RecordAllocationContext.NumberOfFreeBits ));
#endif

                        FileNumber = NtfsAllocateMftRecord( IrpContext, Vcb, FALSE );
                        ASSERT( 0 == FileNumber.SegmentNumberHighPart );

                        NtfsDeallocateMftRecord( IrpContext, Vcb, FileNumber.SegmentNumberLowPart );
                        NtfsCheckpointCurrentTransaction( IrpContext );
#ifdef BENL_DBG
                        KdPrint(( "NTFS: after corection mft records: 0x%x\n", Vcb->MftScb->ScbType.Mft.RecordAllocationContext.NumberOfFreeBits ));
#endif

                        ASSERT( Vcb->MftScb->ScbType.Mft.RecordAllocationContext.NumberOfFreeBits > 1 );
                    }

                    //
                    //  Check if the attribute was deleted in between
                    //

                    if (FlagOn( Scb->ScbState, SCB_STATE_ATTRIBUTE_DELETED)) {
                        try_return( Status = STATUS_FILE_DELETED );
                    }

                    //
                    //  Leave if after regaining the file locks we are out of range
                    //

                    if (MoveData->StartingVcn.QuadPart > LlClustersFromBytes( Vcb, Scb->Header.AllocationSize.QuadPart )) {
                        break;
                    }

                    //
                    //  Check if this range of allocation exists - if not we can skip any work
                    //

                    if (NtfsLookupAllocation( IrpContext, Scb, MoveData->StartingVcn.QuadPart, &Lcn, &ClusterCount, NULL, NULL )) {

                        //
                        //  Now loop over the current range moving pieces of it
                        //

                        while ((MoveData->ClusterCount > 0) && (ClusterCount > 0)) {

                            LONGLONG UpperBound;

                            if (ClusterCount > MoveData->ClusterCount) {
                                TransferSize = LlBytesFromClusters( Vcb, MoveData->ClusterCount );
                            } else {
                                TransferSize = LlBytesFromClusters( Vcb, ClusterCount );
                            }
                            if (TransferSize > BufferLength ) {
                                TransferSize = BufferLength;
                            }
                            TransferClusters = LlClustersFromBytesTruncate( Vcb, TransferSize );

                            //
                            //  Reserve the new cluster if it falls within volume range
                            //

                            if (MoveData->StartingLcn.QuadPart + TransferClusters > Vcb->TotalClusters) {
                                NtfsRaiseStatus( IrpContext, STATUS_ALREADY_COMMITTED, NULL, NULL );
                            }

                            NtfsPreAllocateClusters( IrpContext, Vcb, MoveData->StartingLcn.QuadPart, TransferClusters, &AcquiredBitmap, &AcquiredMft );

                            //
                            //  Only actually transfer ranges within VDD or VDL - for those between
                            //  VDD and allocation size just reallocate. Use VDD for data streams
                            //  for all others that don't update VDD use VDL
                            //

                            if (($DATA == Scb->AttributeTypeCode) &&
                                !FlagOn( Scb->ScbState, SCB_STATE_MODIFIED_NO_WRITE ) &&
                                FlagOn( Scb->AttributeFlags, ATTRIBUTE_FLAG_COMPRESSION_MASK)) {

                                //
                                //  Modified no write streams don't use VDD. The only data
                                //  stream currently like this is $Secure and $UsnJrnl which are not
                                //  defraggable
                                //

                                UpperBound = LlClustersFromBytes( Vcb, Scb->ValidDataToDisk );

                            } else {
                                UpperBound = LlClustersFromBytes( Vcb, Scb->Header.ValidDataLength.QuadPart );
                            }

                            if (MoveData->StartingVcn.QuadPart <= UpperBound) {

                                //
                                //  Call the storage and see if they support
                                //  the copy data ioctl - this allows lower drivers to
                                //  implement a more efficient version of the copy and participate
                                //  particularly in volsnap's case in the defrag
                                //

                                if (IoctlSupported) {

                                    DISK_COPY_DATA_PARAMETERS CopyData;

                                    CopyData.SourceOffset.QuadPart = LlBytesFromClusters( Vcb, Lcn );
                                    CopyData.DestinationOffset.QuadPart = LlBytesFromClusters( Vcb, MoveData->StartingLcn.QuadPart );
                                    CopyData.CopyLength.QuadPart = TransferSize;
                                    CopyData.Reserved = 0;

                                    Status = NtfsDeviceIoControl( IrpContext,
                                                                  Vcb->TargetDeviceObject,
                                                                  IOCTL_DISK_COPY_DATA,
                                                                  &CopyData,
                                                                  sizeof( CopyData ),
                                                                  NULL,
                                                                  0,
                                                                  NULL );
                                }

                                if (!IoctlSupported || !NT_SUCCESS( Status )) {

                                    Status = STATUS_SUCCESS;
                                    IoctlSupported = FALSE;

                                    NtfsCreateMdlAndBuffer( IrpContext,
                                                            Scb,
                                                            RESERVED_BUFFER_ONE_NEEDED,
                                                            &BufferLength,
                                                            &Mdl,
                                                            &Buffer );
                                    Irp->MdlAddress = Mdl;

                                    //
                                    //  First read the cluster
                                    //

                                    NtfsSingleAsync( IrpContext,
                                                     Vcb->TargetDeviceObject,
                                                     LlBytesFromClusters( Vcb, Lcn ),
                                                     (ULONG)TransferSize,
                                                     Irp,
                                                     IRP_MJ_READ,
                                                     0 );

                                    NtfsWaitSync( IrpContext );

                                    NtfsNormalizeAndCleanupTransaction( IrpContext,
                                                                        &Irp->IoStatus.Status,
                                                                        TRUE,
                                                                        STATUS_UNEXPECTED_IO_ERROR );

                                    //
                                    //  Clear return info field
                                    //

                                    Irp->IoStatus.Information = 0;

                                    //
                                    //  Then transfer it to the new location
                                    //

                                    NtfsSingleAsync( IrpContext,
                                                     Vcb->TargetDeviceObject,
                                                     LlBytesFromClusters( Vcb, MoveData->StartingLcn.QuadPart ),
                                                     (ULONG)TransferSize,
                                                     Irp,
                                                     IRP_MJ_WRITE,
                                                     0 );

                                    NtfsWaitSync( IrpContext );


                                    NtfsNormalizeAndCleanupTransaction( IrpContext,
                                                                        &Irp->IoStatus.Status,
                                                                        TRUE,
                                                                        STATUS_UNEXPECTED_IO_ERROR );

                                        Irp->IoStatus.Information = 0;

                                    //
                                    //  Release the buffer before calling lfs which may also need the reserved buffer
                                    //

                                    NtfsDeleteMdlAndBuffer( Mdl, Buffer );
                                    Irp->MdlAddress = NULL;
                                    Buffer = NULL;
                                }
                            }

                            //
                            //  Finally reallocate the cluster in the scb and checkpoint it
                            //

                            NtfsReallocateRange( IrpContext, Scb, MoveData->StartingVcn.QuadPart, TransferClusters, MoveData->StartingVcn.QuadPart, TransferClusters, &MoveData->StartingLcn.QuadPart );
                            NtfsCheckpointCurrentTransaction( IrpContext );

                            ASSERT( IrpContext->TransactionId == 0 );

                            if (AcquiredBitmap) {
                                NtfsReleaseScb( IrpContext, Vcb->BitmapScb );
                                AcquiredBitmap = FALSE;
                            }

                            if (AcquiredMft) {
                                NtfsReleaseScb( IrpContext, Vcb->MftScb );
                                AcquiredMft = FALSE;
                            }

                            MoveData->StartingLcn.QuadPart += TransferClusters;
                            MoveData->StartingVcn.QuadPart += TransferClusters;
                            MoveData->ClusterCount -= (ULONG)TransferClusters;
                            ClusterCount -= TransferClusters;
                            Lcn += TransferClusters;

                        } // endwhile loop over lcn range

                    } else {

                        //
                        //  This is a hole skip over it
                        //

                        MoveData->StartingVcn.QuadPart += ClusterCount;
                        if (ClusterCount > MoveData->ClusterCount) {
                            MoveData->ClusterCount = 0;
                        } else {
                            MoveData->ClusterCount -= (ULONG)ClusterCount;
                        }
                    }

                } except( NtfsDefragExceptionFilter( IrpContext, GetExceptionInformation(), &DeletePendingFailureCountsLeft )) {

                    //
                    //  Cleanup the delete pending failure and next time through the loop
                    //  try to free the recently deallocated clusters to allow the cluster to be deleted
                    //

                    NtfsMinimumExceptionProcessing( IrpContext );
                    IrpContext->ExceptionStatus = STATUS_SUCCESS;

                    FreeRecentlyDeallocated = TRUE;

                }

            } finally {

                //
                //  Unlock the file and let anyone else access the file before
                //  looping back.
                //

                if (Buffer != NULL) {
                    NtfsDeleteMdlAndBuffer( Mdl, Buffer );
                    Irp->MdlAddress = NULL;
                    Buffer = NULL;
                }

                if (AcquiredBitmap) {
                    NtfsReleaseScb( IrpContext, Vcb->BitmapScb );
                    AcquiredBitmap = FALSE;
                }

                if (AcquiredMft) {
                    NtfsReleaseScb( IrpContext, Vcb->MftScb );
                    AcquiredMft = FALSE;
                }

                if (AcquiredScb) {
                    NtfsReleaseFcbWithPaging( IrpContext, Scb->Fcb );
                    AcquiredScb = FALSE;
                }

                if (AcquiredAllFiles) {
                    ExReleaseResourceLite( &Vcb->MftFlushResource );
                    NtfsReleaseAllFiles( IrpContext, Vcb, FALSE );
                    AcquiredAllFiles = FALSE;
                }

                if (AcquiredVcb) {
                    NtfsReleaseVcb( IrpContext, Vcb );
                    AcquiredVcb = FALSE;
                }
            }
        } // endwhile

        Status = STATUS_SUCCESS;

    try_exit: NOTHING;

    } finally {

        DebugUnwind( NtfsDefragFile );

        NtfsCleanupAttributeContext( IrpContext, &AttrContext );
        IrpContext->Union.NtfsIoContext = NULL;

        ASSERT( !AbnormalTermination() || (IrpContext->ExceptionStatus != STATUS_SUCCESS) );

        ASSERT( (Buffer == NULL) &&
                !AcquiredBitmap &&
                !AcquiredMft &&
                !AcquiredAllFiles );

        if (AcquiredScb) {
            NtfsReleaseFcbWithPaging( IrpContext, Scb->Fcb );
        }

        if (AcquiredVcb) {
            NtfsReleaseVcb( IrpContext, Vcb );
            AcquiredVcb = FALSE;
        }

        //
        //  Remove our reference on the users file object.
        //

        ObDereferenceObject( FileObject );
    }

    NtfsCompleteRequest( IrpContext, Irp, Status );
    return Status;
}


LONG
NtfsDefragExceptionFilter (
    IN PIRP_CONTEXT IrpContext OPTIONAL,
    IN PEXCEPTION_POINTERS ExceptionPointer,
    IN OUT PULONG DeletePendingFailureCountsLeft
    )

/*++

Routine Description:

    Exception filter used by the defrag path.  Only STATUS_DELETE_PENDING is
    of interest here: each time it is seen one retry is consumed from the
    caller's budget, and the handler is executed only while retries remain
    after that decrement.  Every other exception code is passed on untouched.

Arguments:

    IrpContext - Not used by this routine.

    ExceptionPointer - Supplies the exception record being checked.

    DeletePendingFailureCountsLeft - On input, how many more times a delete
        pending failure may be retried.  Decremented by one whenever the
        exception code is STATUS_DELETE_PENDING.  The decrement is unsigned,
        so the caller is expected to pass a nonzero count.

Return Value:

    LONG - EXCEPTION_EXECUTE_HANDLER if the exception is a delete pending
        failure and retries remain after the decrement, otherwise
        EXCEPTION_CONTINUE_SEARCH.

--*/

{
    UNREFERENCED_PARAMETER( IrpContext );

    //
    //  Anything other than delete pending is not ours to handle.
    //

    if (ExceptionPointer->ExceptionRecord->ExceptionCode != STATUS_DELETE_PENDING) {

        return EXCEPTION_CONTINUE_SEARCH;
    }

    //
    //  Use up one retry and only run the handler if there are still some left.
    //

    *DeletePendingFailureCountsLeft -= 1;

    return ((*DeletePendingFailureCountsLeft) > 0) ?
           EXCEPTION_EXECUTE_HANDLER :
           EXCEPTION_CONTINUE_SEARCH;
}

//
//  Because of protocol limitations in CIFS which uses 16 bits,
//  redirector can't currently accept buffer sizes larger than 64K.
//

#define RDR_BUFFER_SIZE_LIMIT    0x00010000L


NTSTATUS
NtfsReadFromPlex(
    IN PIRP_CONTEXT IrpContext,
    IN PIRP Irp
    )

/*++

Routine Description:

    This implements directed reads from plexes. Given an offset, a length and a plexnumber
    along with a handle to a file or a volume, this coordinates reads from an underlying
    dynamic (mirrored) volume manager.

    Note that we ignore the VcbState flag VCB_STATE_NO_SECONDARY_AVAILABLE altogether
    and let the lower level driver respond.

    For volume opens the request is forwarded as a single IOCTL_VOLUME_READ_PLEX.
    For file opens the byte range is translated into disk runs with
    NtfsPrepareBuffers and one IOCTL_VOLUME_READ_PLEX is issued per run.  A file
    read that crosses end of file is shortened to the clusters that contain
    valid data; the bytes read beyond file size are zeroed in the user buffer
    and are not counted in the returned length.

Arguments:

    IrpContext - Supplies the IrpContext to process
    Irp - Incoming FSCTL IRP.

Return Value:

    Status SUCCESS on success, otherwise the relevant error code.  Among the
    codes generated directly by this routine:

        STATUS_INVALID_PARAMETER - bad type of open, missing input buffer,
            negative or misaligned offset/length, or an encrypted, compressed
            or sparse stream.
        STATUS_BUFFER_TOO_SMALL - input buffer shorter than the request
            structure or output buffer shorter than the requested length.
        STATUS_INVALID_BUFFER_SIZE - length of 64K or more.
        STATUS_VOLUME_DISMOUNTED, STATUS_END_OF_FILE, STATUS_NOT_IMPLEMENTED
            (resident attribute).

--*/

{
    PPLEX_READ_DATA_REQUEST ReadData;
    PIO_STACK_LOCATION IrpSp;
    ULONG InputBufferLength;
    ULONG UserBufferLength;
    NTSTATUS Status = STATUS_SUCCESS;
    BOOLEAN Wait = TRUE;
    ULONG NumberOfRuns, RemainingByteCount;
    COMPRESSION_CONTEXT CompContext;
    TYPE_OF_OPEN TypeOfOpen;
    IO_RUN IoRuns[NTFS_MAX_PARALLEL_IOS];
    VBO ByteOffset;
    ULONG ByteCount;
    ULONG BytesToEof;
    ULONG LastReadByteCount;
    ULONG CurByteCount;
    LOGICAL AcquiredScb = FALSE;
    VOLUME_READ_PLEX_INPUT NplexRead;
    PVCB Vcb;
    PSCB Scb;
    PFCB Fcb;
    PCCB Ccb;

    //
    //  Extract and decode the file object
    //

    IrpSp = IoGetCurrentIrpStackLocation( Irp );
    TypeOfOpen = NtfsDecodeFileObject( IrpContext,
                                       IrpSp->FileObject,
                                       &Vcb,
                                       &Fcb,
                                       &Scb,
                                       &Ccb,
                                       FALSE );

    //
    //  FileOpens and VolumeOpens are allowed.
    //

    if ((TypeOfOpen != UserFileOpen) &&
        (TypeOfOpen != UserVolumeOpen)) {

        Status = STATUS_INVALID_PARAMETER;
        NtfsCompleteRequest( IrpContext, Irp, Status );
        DebugTrace( -1, Dbg, ("NtfsReadFromPlex -> %08lx\n", Status) );
        return Status;
    }

    //
    //  This FSCTL is of type METHOD_OUT_DIRECT. The Io Manager has already
    //  copied the input parameters into the systembuffer field, probed the
    //  output buffer and locked the Mdls for us.
    //

    ReadData = (PPLEX_READ_DATA_REQUEST)Irp->AssociatedIrp.SystemBuffer;

    if (ReadData == NULL) {

        Status = STATUS_INVALID_PARAMETER;
        NtfsCompleteRequest( IrpContext, Irp, Status );
        DebugTrace( -1, Dbg, ("NtfsReadFromPlex -> %08lx\n", Status) );
        return Status;
    }

    //
    //  The system buffer is only as large as the caller's input buffer, so
    //  make sure it really holds a whole request structure before we read
    //  any field out of it.
    //

    InputBufferLength = IrpSp->Parameters.FileSystemControl.InputBufferLength;
    UserBufferLength = IrpSp->Parameters.FileSystemControl.OutputBufferLength;

    if (InputBufferLength < sizeof( PLEX_READ_DATA_REQUEST )) {

        Status = STATUS_BUFFER_TOO_SMALL;
        NtfsCompleteRequest( IrpContext, Irp, Status );
        DebugTrace( -1, Dbg, ("NtfsReadFromPlex -> %08lx\n", Status) );
        return Status;
    }

    ByteOffset = ReadData->ByteOffset.QuadPart;
    ByteCount = ReadData->ByteLength;

    //
    //  Now, do the grunt work and clean up within a try finally.
    //

    try {

        //
        //  CurByteCount is the number of bytes read so far.  BytesToEof stays
        //  zero unless the read is shortened at end of file below, in which
        //  case it holds the number of valid bytes up to end of file.
        //

        CurByteCount = 0;
        BytesToEof = 0;
        Irp->IoStatus.Information = 0;

        //
        //  Sanity check the read offset and length.  The offset is signed, so
        //  reject negative values up front; that also keeps the subtraction
        //  in the wrap check from overflowing.
        //

        if ((ByteOffset < 0) ||

            (ByteCount > MAXLONGLONG - ByteOffset) ||

            //
            //  File offsets should be cluster aligned
            //

            ((TypeOfOpen == UserFileOpen) &&
             ((ByteOffset & Vcb->ClusterMask) || (ByteCount & Vcb->ClusterMask))) ||

            //
            //  Volume offsets should be sector aligned
            //

            ((TypeOfOpen == UserVolumeOpen) &&
             (((ULONG)ByteOffset & (Vcb->BytesPerSector - 1)) || (ByteCount & (Vcb->BytesPerSector - 1))))) {

            Status = STATUS_INVALID_PARAMETER;
            leave;
        }

        //
        //  No-op
        //

        if (ByteCount == 0) {

            ASSERT(Status == STATUS_SUCCESS);
            ASSERT(CurByteCount == ByteCount);
            leave;
        }

        //
        //  Because of protocol limitations in CIFS which uses 16 bits,
        //  redirector can't accept buffer sizes larger than 64K.
        //

        if (ByteCount & ~(RDR_BUFFER_SIZE_LIMIT - 1L)) {

            Status = STATUS_INVALID_BUFFER_SIZE;
            leave;
        }

        //
        //  The output buffer has to be able to hold everything we were asked
        //  to read.
        //

        if (UserBufferLength < ByteCount) {

            Status = STATUS_BUFFER_TOO_SMALL;
            leave;
        }

        //
        //  For volume DASD reads, we just send an IOCTL down...
        //

        if (TypeOfOpen == UserVolumeOpen) {

            NplexRead.ByteOffset.QuadPart = ByteOffset;
            NplexRead.Length = ByteCount;
            NplexRead.PlexNumber = ReadData->PlexNumber;

            Status = NtfsDeviceIoControl( IrpContext,
                                          Vcb->TargetDeviceObject,
                                          IOCTL_VOLUME_READ_PLEX,
                                          &NplexRead,
                                          sizeof( VOLUME_READ_PLEX_INPUT ),
                                          NtfsMapUserBuffer( Irp, NormalPagePriority ),
                                          ByteCount,
                                          &Irp->IoStatus.Information );

            ASSERT(!NT_SUCCESS( Status ) || Irp->IoStatus.Information != 0);

            //
            //  Remember how much was read so the completion code below
            //  reports it instead of resetting the length to zero.
            //

            CurByteCount = (ULONG) Irp->IoStatus.Information;

            DebugTrace( 0, Dbg, ("NtfsReadFromPlex: VolumeRead\n") );
            leave;
        }

        NtfsAcquireSharedScb( IrpContext, Scb );
        AcquiredScb = TRUE;

        //
        //  If the volume isn't mounted then fail immediately.
        //

        if (FlagOn( Scb->ScbState, SCB_STATE_VOLUME_DISMOUNTED )) {

            Status = STATUS_VOLUME_DISMOUNTED;
            leave;
        }

        //
        //  We don't get along with encrypted/compressed/sparse things.
        //  ISSUE: supw: actually sparse should be ok, now that i'm using preparebuffers.
        //

        if (FlagOn( Scb->AttributeFlags, ATTRIBUTE_FLAG_ENCRYPTED |
                                         ATTRIBUTE_FLAG_COMPRESSION_MASK |
                                         ATTRIBUTE_FLAG_SPARSE )) {

            DebugTrace( 0, Dbg, ("NtfsReadFromPlex: File encrypted or compressed -> %08lx\n",
                                  STATUS_INVALID_PARAMETER) );
            Status = STATUS_INVALID_PARAMETER;
            leave;
        }

        NtfsAcquireFsrtlHeader( Scb );

        //
        //  Make sure we aren't starting past the end of the file, in which case
        //  we would have nothing to return.
        //

        if (ByteOffset >= Scb->Header.FileSize.QuadPart) {

            DebugTrace( 0, Dbg, ("NtfsReadFromPlex: beyond eof\n") );
            Status = STATUS_END_OF_FILE;
            NtfsReleaseFsrtlHeader( Scb );
            leave;
        }

        //
        //  We can't read beyond filesize.  Remember how many bytes are valid
        //  and read just the clusters that contain them.  BytesToEof is
        //  nonzero here because the offset is strictly below file size.
        //

        if (Scb->Header.FileSize.QuadPart - ByteOffset < ByteCount) {

            BytesToEof = ByteCount = (ULONG)(Scb->Header.FileSize.QuadPart - ByteOffset);
            ByteCount = ClusterAlign( Vcb, ByteCount );
        }

        NtfsReleaseFsrtlHeader( Scb );

        //
        //  We need to sanity check ByteCount again if we rounded it up.  This
        //  is done in place, still holding the Scb, so the end of file
        //  bookkeeping above is preserved.  The original count was itself
        //  cluster aligned, so the rounded value is not expected to exceed it;
        //  the checks are kept as a defensive measure.
        //

        if (BytesToEof != 0) {

            if (ByteCount & ~(RDR_BUFFER_SIZE_LIMIT - 1L)) {

                Status = STATUS_INVALID_BUFFER_SIZE;
                leave;
            }

            if (UserBufferLength < ByteCount) {

                Status = STATUS_BUFFER_TOO_SMALL;
                leave;
            }
        }

        //
        //  Can't deal with resident files.
        //

        if (FlagOn( Scb->ScbState, SCB_STATE_ATTRIBUTE_RESIDENT )) {

            Status = STATUS_NOT_IMPLEMENTED;
            leave;
        }

        //
        //  PrepareBuffers needs a CompressionContext for the IO_RUN array.
        //

        RtlZeroMemory( &CompContext, sizeof(COMPRESSION_CONTEXT) );
        CompContext.IoRuns = IoRuns;
        CompContext.AllocatedRuns = NTFS_MAX_PARALLEL_IOS;
        CompContext.FinishBuffersNeeded = FALSE;

        //
        //  Get the run information, and send the IOCTL down.
        //

        while (TRUE) {

            ULONG RunCount;
            ULONG_PTR SizeOfThisRead;
            Irp->IoStatus.Status = STATUS_SUCCESS;

            //
            //  Build an array of io runs to do our reads from.
            //

            RemainingByteCount = NtfsPrepareBuffers( IrpContext,
                                                     Irp,
                                                     Scb,
                                                     &ByteOffset,
                                                     ByteCount,
                                                     0,
                                                     &Wait,
                                                     &NumberOfRuns,
                                                     &CompContext );

            ASSERT( RemainingByteCount < ByteCount );
            ASSERT( Wait == TRUE );
            ASSERT( NumberOfRuns > 0 );
            ASSERT( NumberOfRuns > 1 || RemainingByteCount == 0 );

            //
            //  Send synchronous IOCTLs down to do the plex reads.
            //

            for (RunCount = 0;
                 RunCount < NumberOfRuns;
                 RunCount += 1) {

                NplexRead.ByteOffset.QuadPart = CompContext.IoRuns[RunCount].StartingLbo;
                NplexRead.Length = CompContext.IoRuns[RunCount].ByteCount;
                NplexRead.PlexNumber = ReadData->PlexNumber;

                //
                //  While CurByteCount keeps track of the total amount of bytes read,
                //  SizeOfThisRead carries the size of the last read done. This is usually
                //  equal to the IoRuns[].ByteCount.
                //

                SizeOfThisRead = 0;
                ASSERT(CompContext.IoRuns[RunCount].ByteCount > 0);

                Status = NtfsDeviceIoControl( IrpContext,
                                              Vcb->TargetDeviceObject,
                                              IOCTL_VOLUME_READ_PLEX,
                                              &NplexRead,
                                              sizeof(VOLUME_READ_PLEX_INPUT),
                                              Add2Ptr( NtfsMapUserBuffer( Irp, NormalPagePriority ), CurByteCount ),
                                              CompContext.IoRuns[RunCount].ByteCount,
                                              &SizeOfThisRead);

                if (!NT_SUCCESS( Status )) {

                    //
                    //  Success if we read anything at all.
                    //

                    if (CurByteCount != 0) {

                        Status = STATUS_SUCCESS;
                    }

                    leave;
                }

                //
                //  This value was taken from the Iosb.Information field of the subordinate
                //  IRP, and should contain a nonzero value for successful completions.
                //

                ASSERT( (SizeOfThisRead != 0) && ((ULONG) SizeOfThisRead <= CompContext.IoRuns[RunCount].ByteCount) );
                CurByteCount = CurByteCount + (ULONG) SizeOfThisRead;

                //
                //  We don't have any more space left
                //

                if (UserBufferLength <= (ULONG) SizeOfThisRead) {

                    ASSERT( Status == STATUS_SUCCESS );
                    leave;
                }

                UserBufferLength = UserBufferLength - (ULONG) SizeOfThisRead;
            }

            if (RemainingByteCount == 0) {

                ASSERT( Status == STATUS_SUCCESS );
                break;
            }

            //
            //  We have more to read.  Advance past what the last batch of
            //  runs covered and go around again for the remainder.
            //

            LastReadByteCount = ByteCount - RemainingByteCount;

            ByteOffset = ByteOffset + LastReadByteCount;
            CompContext.SystemBufferOffset = CompContext.SystemBufferOffset + LastReadByteCount;
            ByteCount = RemainingByteCount;

        }

    } finally {

        if (AcquiredScb) {

            NtfsReleaseScb( IrpContext, Scb );
        }

        //
        //  If nothing raised then complete the irp.
        //

        if (!AbnormalTermination()) {

            if (NT_SUCCESS( Status )) {

                //
                //  We have to be careful to zero beyond the filesize.  This
                //  only applies when the read was shortened at end of file
                //  (BytesToEof nonzero) and we actually read past that point;
                //  otherwise everything we read is valid data.
                //

                if ((BytesToEof != 0) && (CurByteCount > BytesToEof)) {

                    RtlZeroMemory( Add2Ptr( NtfsMapUserBuffer( Irp, NormalPagePriority ), BytesToEof ),
                                   CurByteCount - BytesToEof );
                    Irp->IoStatus.Information = BytesToEof;

                } else {

                    Irp->IoStatus.Information = CurByteCount;

                }
            }

            NtfsCompleteRequest( IrpContext, Irp, Status );
        }

    }

    DebugTrace( -1, Dbg, ("NtfsReadPlex-> %08lx\n", Status) );

    return Status;
}

#if EFSDBG

NTSTATUS
NtfsDummyEfsRead (
    IN OUT PUCHAR InOutBuffer,
    IN PLARGE_INTEGER Offset,
    IN ULONG BufferSize,
    IN PVOID Context
    )

/*++

Routine Description:

    Debug only (EFSDBG) dummy read transform that works on the buffer in
    place.  In non-SYSCACHE builds it walks the buffer in 0x200 byte steps
    and xors the first eight bytes at each step with the file offset of that
    step.  In SYSCACHE builds the only work is a verification call that is
    currently compiled out by a constant FALSE.

Arguments:

    InOutBuffer - Buffer to transform in place.

    Offset - File offset corresponding to the start of the buffer.  Asserted
        to be a multiple of 0x200 in non-SYSCACHE builds.

    BufferSize - Size of the buffer in bytes.  Zero means do nothing.
        Asserted to be a multiple of 0x200 in non-SYSCACHE builds.

    Context - Not used.

Return Value:

    STATUS_SUCCESS always.

--*/

{
#ifndef SYSCACHE
    ULONG StepOffset;
    PLONGLONG Target;
#endif

    UNREFERENCED_PARAMETER( Context );

    //
    //  A zero length call is just there to make sure the compiler doesn't
    //  throw this function out, so there is nothing to do for it.
    //

    if (BufferSize == 0) {

        return STATUS_SUCCESS;
    }

#ifdef SYSCACHE
    if (FALSE && VerifySyscacheData) {

        FsRtlVerifySyscacheData( NULL,
                                 InOutBuffer,
                                 BufferSize,
                                 Offset->LowPart );
    }
#else
    ASSERT( (Offset->QuadPart & 0x1ff) == 0 );
    ASSERT( (BufferSize & 0x1ff) == 0 );

    //
    //  Flip the leading eight bytes at every 0x200 byte step.  The transform
    //  is done in place, so the bits are simply left in the buffer.
    //

    for (StepOffset = 0; (StepOffset + 8) < BufferSize; StepOffset += 0x200) {

        Target = (PLONGLONG) Add2Ptr(InOutBuffer, StepOffset);
        *Target ^= (Offset->QuadPart + (LONGLONG) StepOffset);
    }
#endif

    return STATUS_SUCCESS;
}

NTSTATUS
NtfsDummyEfsWrite (
    IN PUCHAR InBuffer,
    OUT PUCHAR OutBuffer,
    IN PLARGE_INTEGER Offset,
    IN ULONG BufferSize,
    IN PUCHAR Context
    )

/*++

Routine Description:

    Debug only (EFSDBG) dummy write transform.  The input is copied to the
    output buffer and then, in non-SYSCACHE builds, the first eight bytes at
    each 0x200 byte step of the output are xored with the file offset of that
    step.  In SYSCACHE builds the only extra work is a verification call that
    is currently compiled out by a constant FALSE.

Arguments:

    InBuffer - Source data.

    OutBuffer - Receives the transformed copy of InBuffer.

    Offset - File offset corresponding to the start of the buffers.  Asserted
        to be a multiple of 0x200 in non-SYSCACHE builds.

    BufferSize - Number of bytes to process.  Zero means do nothing.
        Asserted to be a multiple of 0x200 in non-SYSCACHE builds.

    Context - Not used.

Return Value:

    STATUS_SUCCESS always.

--*/

{
#ifndef SYSCACHE
    ULONG StepOffset;
    PLONGLONG Target;
#endif

    UNREFERENCED_PARAMETER( Context );

    //
    //  A zero length call is just there to make sure the compiler doesn't
    //  throw this function out, so there is nothing to do for it.
    //

    if (BufferSize == 0) {

        return STATUS_SUCCESS;
    }

    //
    //  Start from a straight copy of the input in the output buffer.
    //

    RtlCopyMemory( OutBuffer,
                   InBuffer,
                   BufferSize );

#ifdef SYSCACHE
    if (FALSE && VerifySyscacheData) {

        FsRtlVerifySyscacheData( NULL,
                                 OutBuffer,
                                 BufferSize,
                                 Offset->LowPart );
    }
#else
    ASSERT( (Offset->QuadPart & 0x1ff) == 0 );
    ASSERT( (BufferSize & 0x1ff) == 0 );

    //
    //  Flip the leading eight bytes of the output at every 0x200 byte step.
    //

    for (StepOffset = 0; (StepOffset + 8) < BufferSize; StepOffset += 0x200) {

        Target = (PLONGLONG) Add2Ptr(OutBuffer, StepOffset);
        *Target ^= (Offset->QuadPart + (LONGLONG) StepOffset);
    }
#endif

    return STATUS_SUCCESS;
}

#endif