Bit Array

Do you ever need to simply track the existence or lack thereof of something? Just a simple 1 or 0? Sure, we all do. Now, have you ever had to do this for millions of items? If so, what would you use? Many employ boolean arrays (just peruse database and data science projects on GitHub). Would you be surprised to learn a boolean value is not “precisely defined” in terms of storage in Java? I was. What does this mean? Well, the size of a boolean is variable, making it a bit difficult to predict memory consumption. In general, however, the assumption of 1 byte per entry holds, as corroborated by many of the experiments I ran comparing UPC to Java. Back to your millions of booleans. If the count is double-digit millions, then a few megabytes is generally quite easily found. However, once you move into the hundreds of millions or even billions, memory consumption becomes a little much for what is essentially an on/off flag. Likewise, Java arrays are int-indexed, resulting in a hard limit of 231-1. Thus, instead of storing each flag as a byte, we can compress the array by a factor of eight using bit arrays.

Java’s API provides two BitArray classes. The first is com.sun.org.apache.xalan.internal.xsltc.dom.BitArray. This is designed for document parsing (obviously) and is limited in functionality. Its storage has an upper-bound of 231-1 as it only accepts ints as indices (theoretically, it could reach 236-1 with long indices – max array index of 231-1 times 32 bits in an int). It also performs a few additional if operations (expensive, especially if branch prediction is an issue) while setting a value, and it does not have a clear bit method. The second is sun.security.util.BitArray. This class is compiled in Oracle’s JDK, so the code reviewed was taken from OpenJDK 8u60. It uses a byte[] and has a bit clearing function, but is still bound by int indexing (theoretically, it could reach 234-1 with long indices – max array index of 231-1 times 8 bits in a byte). Its implementation, however, is less than optimized, as it relies on chaining mathematical operators instead of bitwise, and its set and clear functions are combined – the operation is determined by an if statement.

As a result, the following is a proposed BitArray class. Some of the basics are as follows. First, it uses a long[] for storage and long bit indexing, increasing capacity to 237-1 (max array index of 231-1 times 64 bits in a long). Second, we minimize the number of operations by using masks and bitwise operations. The following table provides more details about the type and number of operations for getting, setting, and clearing bits.

ClassStorageMax BitsGet OperationsSet OperationsClear Operations
dom.BitArrayint[]int index max
231-1 = 2,147,483,647
1 unary and 1 nested if
1 >>>
1 %
2 array reads (bits/mask)
1 &
1 !=0 comparison
4 unary and 1 nestedif
1 >>>
1 %
1 array read (mask)
1 |
1 array set
Not supported
util.BitArraybyte[]int index max
231-1 = 2,147,483,647
1 if < || >=
1 integer /
1 %
2 subtractions
1 <<
1 array read
1 &
1 !=0 comparison
1 if < || >=
1 unary if
1 integer /
1 %
2 subtractions
1 <<
1 |
1 array set
1 if < || >=
1 unary if
1 integer /
1 %
2 subtractions
1 <<
1 ~
1 &
1 array set
Proposedlong[]int*64 bits/long
237-1 = 137,438,953,471
1 if >= || <
1 >>>
1 %
2 array reads (bits/mask)
1 &
1!=0 comparison
1 if >= || <
1 >>>
1 %
1 array read (mask)
1 |
1 array set
1 if >= || <
1 >>>
1 %
1 array read (mask)
1 ~
1 &
1 array set
/**
 * A simple bit array class with sixty-four times the capacity of a normal Java array.
 * @author Ray Hylock
 */
public class BitArray {
    /** {@link Integer#MAX_VALUE} times 64 */
    public static final long MAX_LENGTH = (long)Integer.MAX_VALUE << 6L;
     
    // local values
    private long[] array;
    private long length;
     
    // masks
    private long[] masks = {  
        0x8000000000000000L, 0x4000000000000000L, 0x2000000000000000L, 0x1000000000000000L,
        0x0800000000000000L, 0x0400000000000000L, 0x0200000000000000L, 0x0100000000000000L,
        0x0080000000000000L, 0x0040000000000000L, 0x0020000000000000L, 0x0010000000000000L,
        0x0008000000000000L, 0x0004000000000000L, 0x0002000000000000L, 0x0001000000000000L,
        0x0000800000000000L, 0x0000400000000000L, 0x0000200000000000L, 0x0000100000000000L,
        0x0000080000000000L, 0x0000040000000000L, 0x0000020000000000L, 0x0000010000000000L,
        0x0000008000000000L, 0x0000004000000000L, 0x0000002000000000L, 0x0000001000000000L,
        0x0000000800000000L, 0x0000000400000000L, 0x0000000200000000L, 0x0000000100000000L,
        0x0000000080000000L, 0x0000000040000000L, 0x0000000020000000L, 0x0000000010000000L,
        0x0000000008000000L, 0x0000000004000000L, 0x0000000002000000L, 0x0000000001000000L,
        0x0000000000800000L, 0x0000000000400000L, 0x0000000000200000L, 0x0000000000100000L,
        0x0000000000080000L, 0x0000000000040000L, 0x0000000000020000L, 0x0000000000010000L,
        0x0000000000008000L, 0x0000000000004000L, 0x0000000000002000L, 0x0000000000001000L,
        0x0000000000000800L, 0x0000000000000400L, 0x0000000000000200L, 0x0000000000000100L,
        0x0000000000000080L, 0x0000000000000040L, 0x0000000000000020L, 0x0000000000000010L,
        0x0000000000000008L, 0x0000000000000004L, 0x0000000000000002L, 0x0000000000000001L
    };
     
    /**
     * Creates a new bit array with maximum possible length of 
     * {@link #MAX_LENGTH}.
     * @param length length of array
     */
    public BitArray(long length){
        // validate length
        if(length < 0)
            throw new ArrayIndexOutOfBoundsException("Length cannot be negative.");
        if(length > MAX_LENGTH)
            throw new ArrayIndexOutOfBoundsException("Length "+length+
                    " is larger than the maximum allowed: "+MAX_LENGTH+".");
         
        // instantiate array and set length
        array = new long[(int)Math.ceil(((double)length)/64d)];
        this.length = length;
    }
     
    /**************************************************************************
     *                             GETTER METHODS                             *
     **************************************************************************/
    /**
     * Get the length.
     * @return length
     */
    public long length(){
        return length;
    }
     
    /**
     * Returns {@code true} if set, {@code false} otherwise.
     * @param index index in array to check
     * @return {@code true} if set, {@code false} otherwise
     */
    public boolean get(long index){
        validate(index);
        return getNoCheck(index);
    }
     
    /**
     * Returns {@code true} if set, {@code false} otherwise without validating
     * the index position.
     * @param index index in array to check
     * @return {@code true} if set, {@code false} otherwise
     */
    public boolean getNoCheck(long index){
        return (array[(int)(index >>> 6)] & masks[(int)(index%64)]) != 0;
    }
     
    /**************************************************************************
     *                             SETTER METHODS                             *
     **************************************************************************/
    /**
     * Sets the bit at position {@code index}.
     * @param index index of bit to set
     */
    public void set(long index){
        validate(index);
        setNoCheck(index);
    }
     
    /**
     * Sets the bit at position {@code index} without validating the index 
     * position.
     * @param index index of bit to set
     */
    public void setNoCheck(long index){
        array[(int)(index >>> 6)] |= masks[(int)(index%64)];
    }
     
    /**
     * Clears the bit at position {@code index}.
     * @param index index of bit to clear
     */
    public void clear(long index){
        validate(index);
        clearNoCheck(index);
    }
     
    /**
     * Clears the bit at position {@code index} without validating the index 
     * position.
     * @param index index of bit to clear
     */
    public void clearNoCheck(long index){
        array[(int)(index >>> 6)] &= ~masks[(int)(index%64)];
    }
     
    /**************************************************************************
     *                            ITERATOR METHODS                            *
     **************************************************************************/
    /**
     * Returns a new iterator.
     * @return a new iterator
     */
    public BitArrayIterator iterator(){
        return new BitArrayIterator();
    }
     
    /**
     * Bit array iterator.
     */
    public class BitArrayIterator {
        // current index position
        private long idx;
        private int bitIdx, arrayIdx;
         
        /**
         * Instantiate a new bit array iterator.
         */
        public BitArrayIterator(){
            reset();
        }
         
        /**
         * Returns {@code true} if there are more bits, {@code false} otherwise.
         * @return {@code true} if there are more bits, {@code false} otherwise
         */
        public boolean hasNext(){
            return ++idx < length;
        }
         
        /**
         * Returns {@code true} if the next bit is set, {@code false} otherwise.
         * @return {@code true} if the next bit is set, {@code false} otherwise
         */
        public boolean next(){
            /* 20-30% faster than return isBitSet(idx). isBitSet(idx) is
             * necessary for random access. However, since we are accessing the
             * information sequentially, we can avoid a mod and shift operation
             * by way of a switch statement. We also avoid calling a method 
             * from its enclosing class and casting from longs to ints.*/
            switch(bitIdx){
                // [-1,63] because we increment before reading
                case 63: bitIdx = -1; arrayIdx++; break;
            }
            return (array[arrayIdx] & masks[++bitIdx]) != 0;
        }
         
        /**
         * Rests the iterator for reuse.
         */
        public void reset(){
            idx = -1;
            bitIdx = -1;
            arrayIdx = 0;
        }
    }
     
    /**************************************************************************
     *                            UTILITY METHODS                             *
     **************************************************************************/
    /**
     * Validate the index.
     * @param i index to validate
     */
    private void validate(long i){
        if(i >= length || i < 0)
            throw new ArrayIndexOutOfBoundsException("Array index out of range: "+i);
    }
}

The following is an example implementation of the BitArray class that (1) creates a bit array of size 3 billion (this would fail on both Java API BitArray classes), (2) sets values (3) gets values, (4) clears a value, and (5) utilizes BitArrayIterator (included in this class) to print every index containing a value of 1.

public class BitArrayTest {
    public static void main(String[] args) {
        long length = 3_000_000_000L;
        BitArray a = new BitArray(length);
        DecimalFormat df = new DecimalFormat("###,###,###,###,##0");
        out.println("length: "+df.format(a.length())+" = "
                +df.format(a.length()/8/1024/1024)+"MB");
         
        out.println("setting 0");              a.set(0);
        out.println("setting 9");              a.set(9);
        out.println("setting 7");              a.set(7);
        out.println("setting 3");              a.set(3);
        out.println("setting 2114854748");     a.set(2114854748); 
        out.println("setting 2454784216L");    a.set(2454784216L); 
         
        out.println("checking 0 = "           +a.get(0)          +" (should be true)");
        out.println("checking 1 = "           +a.get(1)          +" (should be false)");
        out.println("checking 2114854748 = "  +a.get(2114854748) +" (should be true)");
        out.println("checking 2454784216L = " +a.get(2454784216L)+" (should be true)");
        out.println("checking 29999999999L = "+a.get(2999999999L)+" (should be false)");
         
        out.println("checking 7 = "           +a.get(7)          +" (should be true)");
        out.println("clearning 7");            a.clear(7);
        out.println("checking 7 = "           +a.get(7)          +" (should be false)");
         
        out.println("Iterator...");
        BitArrayIterator it = a.iterator();
        long idx = 0;
        while(it.hasNext()){
            if(it.next()) out.println("set: "+idx);
            idx++;
        }
    }
}

Output from the example class (takes a few seconds because it iterates over 3 billion bits):

length: 3,000,000,000 = 357MB
setting 0
setting 9
setting 7
setting 3
setting 2114854748
setting 2454784216L
checking 0 = true (should be true)
checking 1 = false (should be false)
checking 2114854748 = true (should be true)
checking 2454784216L = true (should be true)
checking 29999999999L = false (should be false)
checking 7 = true (should be true)
clearning 7
checking 7 = false (should be false)
Iterator...
set: 0
set: 3
set: 9
set: 2114854748
set: 2454784216

If you’re interested, here is a very simple JUnit test.

import java.util.HashSet;
import java.util.concurrent.ThreadLocalRandom;
import org.junit.Test;
import static org.junit.Assert.*;
public class BitArrayJunitTest {
    private final long length = 3_000_000_000L, minIdx = 0, maxIdx = length-1;
    private final int maxItemsAdded = 100_000;
    private final BitArray bitArray = new BitArray(length);
    private final HashSet<Long> idxHash = new HashSet<>();
    private enum IS_SET { FALSE, TRUE };
    private enum LOOK { FORWARD, REVERSE };
    @Test
    public void test() {
        // check length
        assertEquals(length, bitArray.length());
         
        // check not set -> set -> check set
        for(int i=0; i<ThreadLocalRandom.current().nextInt(maxItemsAdded); i++)
            set(ThreadLocalRandom.current().nextLong(length), false, true);
         
        // check not set
        check(find(minIdx, IS_SET.FALSE, LOOK.FORWARD), false);
        check(find(maxIdx, IS_SET.FALSE, LOOK.REVERSE), false);
         
        // check set -> clear -> check not set
        clear(find(98718, IS_SET.TRUE, LOOK.FORWARD), true, false);
         
        // check set -> set -> check set (no change)
        set(find(241812781, IS_SET.TRUE, LOOK.REVERSE), true, true);
         
        // check not set -> clear -> check not set (no change)
        clear(find(549841, IS_SET.FALSE, LOOK.FORWARD), false, false);
         
        // test the iterator against known truth
        long idx = 0; BitArrayIterator it = bitArray.iterator();
        while(it.hasNext()) assertEquals(idxHash.contains(idx++), it.next());
    }
    private long find(long offset, IS_SET isSet, LOOK look){
        long ret = offset, step = (look == LOOK.FORWARD) ? 1 : -1;
        switch(isSet){
            case FALSE: while(idxHash.contains(ret)) ret += step; break;
            case TRUE: while(!idxHash.contains(ret)) ret += step; break;
        }
        return ret;
    }
    private void clear(long idx, boolean before, boolean after){
        idxHash.remove(idx);
        check(idx, before);
        bitArray.clear(idx);
        check(idx, after);
    }
    private void set(long idx, boolean before, boolean after){
        idxHash.add(idx);
        check(idx, before);
        bitArray.set(idx);
        check(idx, after);
    }
    private void check(long idx, boolean expected){
        assertEquals(expected, bitArray.get(idx));
    }
}