If you want to use hardware _sampling_, then converting to R8 (as well as making mipmaps, I guess) is the only way I can think of.
On the other hand, if you're OK with writing your own sampling routines, then you could do as follows:
1) Load the data into an R8 texture directly: do not expand every bit into a byte holding 0 or 255; instead, store each group of 8 bits of the 1-bit-deep image in one texel of the 8-bit-deep texture.
2) Write your own sampling code using the texture read() function, as shown below (this is very much pseudo-code: no bounds checking, and it was just typed in here). This way you'll still get hardware cache support, and it should work reasonably fast. If memory serves, it can even be a bit more accurate than "true" hardware sampling, which at least used to have limited precision on some hardware.
// Reads a single bit from a 1-bit image that has been packed eight bits per
// texel into an 8-bit unsigned-integer texture.
//
// coord is given in 1-bit increments (so the first bit has x == 0, the second
// has x == 1 and so on); coord.y is the row index, unchanged by the packing.
// Bit 0 of each texel is treated as the leftmost of its eight bits
// (NOTE(review): confirm this matches the bit order used when uploading).
// No bounds checking is done: the caller must pass a coordinate inside the image.
// Returns 0 or 1.
uint read_1bit_packed(texture2d<uint> texture, uint2 coord)
{
    // The low three bits of x select the bit inside the packed texel...
    uint shift = coord.x & 0x7;
    // ...and the remaining bits select the texel itself.
    coord.x >>= 3;
    // read() yields a 4-component vector; the packed byte of a single-channel
    // texture is in the red component, so pick it before shifting/masking.
    return (texture.read(coord).r >> shift) & 0x1;
}
// Bilinearly filtered lookup into a 1-bit image packed eight bits per texel
// (see read_1bit_packed for the packing).
//
// coord is given in <0,1> range; values outside that range are clamped to the
// nearest edge bit.
// Returns a value in <0,1>: the weighted average of the four surrounding bits.
//
// NOTE(review): the sample position is coord * size with no half-bit offset,
// so results are presumably shifted by half a bit relative to hardware
// filtering, which places texel centres at half-integer positions — confirm
// which convention the caller expects.
float sample_1bit(texture2d<uint> texture, float2 coord)
{
    // image size in bits: every texel holds 8 horizontally adjacent bits
    uint bits_w = texture.get_width() * 8;
    uint bits_h = texture.get_height();

    // convert to floating point coordinates of sample position (in bits)
    float2 access = float2(coord.x * float(bits_w), coord.y * float(bits_h));

    // clamp to the valid bit range so that coord == 1.0 (or anything outside
    // <0,1>) never produces an out-of-range index or a negative-to-uint cast
    float max_x = float(bits_w - 1);
    float max_y = float(bits_h - 1);
    access.x = (access.x < 0.0f) ? 0.0f : ((access.x > max_x) ? max_x : access.x);
    access.y = (access.y < 0.0f) ? 0.0f : ((access.y > max_y) ? max_y : access.y);

    // sampling indices and weights
    uint i0 = uint(access.x);
    uint j0 = uint(access.y);
    // the right/bottom neighbours are clamped to the edge as well
    uint i1 = (i0 + 1 < bits_w) ? (i0 + 1) : i0;
    uint j1 = (j0 + 1 < bits_h) ? (j0 + 1) : j0;
    float h = access.x - float(i0);
    float v = access.y - float(j0);

    // load four values
    float v00 = float(read_1bit_packed(texture, uint2(i0, j0)));
    float v10 = float(read_1bit_packed(texture, uint2(i1, j0)));
    float v01 = float(read_1bit_packed(texture, uint2(i0, j1)));
    float v11 = float(read_1bit_packed(texture, uint2(i1, j1)));

    // bilinear interpolation: blend horizontally on each row, then vertically
    float top    = ((1.0f - h) * v00) + (h * v10);
    float bottom = ((1.0f - h) * v01) + (h * v11);
    return (top * (1.0f - v)) + (bottom * v);
}