On 19/06/2024 01:34, Tim Rentsch wrote:
Would you mind if I asked you to post the code so I can
take a look at it? Or if you would rather email it to
me that also works.
I've hacked out all my include files, added a GPL licence, and included build instructions. It follows in the signature block. 600 lines :(
I bet the line wrapping breaks it too.
Andy
-- /* Programme to calculate primes using the Sieve or Eratosthenes Copyright (C) 2024 the person known as Vir Campestris This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <
https://www.gnu.org/licenses/>.
*/
/*
I build this programme in two ways on Linux. These commands
g++ -O3 --std=c++20 usenet.cpp -o usenet && time ./usenet
g++ -O3 --std=c++20 -D FAST usenet.cpp -o usenet && time ./usenet
compile and run it with, and without, a loop that goes around once.
It is faster when the loop is present.
*/
#include <algorithm>
#include <cassert>
#include <chrono>
#include <climits>
#include <cmath>
#include <csignal>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <list>
#include <thread>
#include <vector>
// If these trigger you have a really odd architecture.
static_assert(sizeof(uint8_t) == 1);
static_assert(CHAR_BIT == 8);
// The type of a prime number.
// size_t isn't big enough on a 32-bit machine with 2GB memory allocated and 16 primes per byte.
typedef uint64_t PRIME;
// Internal class used to store the mask data for one or more
// primes rather than all primes. This has an initial block,
// followed by a block which contains data which is repeated
// for all higher values.
// Example: StoreType== uint8_t, prime == 3, the mask should be
// 90 meaning 1,3,5,7,11,13 are prime but not 9, 15
// 24,49,92 covers odds 17-63
// 24,49,92 covers 65-111, with an identical mask
// As the mask is identical we only need to store the
// non-repeat, plus one copy of the repeated block and data to
// show where it starts and ends. Note that for big primes the
// initial block may be more than one StoreType.
// As there are no common factors between any prime and the
// number of bits in StoreType the repeat is the size of the
// prime.
// If you have an unusual architecture,
// such as ICL 1900 (24 bit word)
// or DECSystem10 (36 bit word)
// this repeat could be smaller for 3,
// but that is too much of a corner case for me to care.
// I haven't seen one of them since the 1970s.
template <typename StoreType, PRIME modulus=30> class Masker
{
// The length of the initial block.
size_t initSize;
// The size of the repeated part. The entire store size is the sum of these two.
size_t repSize;
// The actual buffer
// Allocates a zero filled buffer
class Buffer
{
StoreType* mpBuffer; // base of the data referred to
size_t mSize; // size of the buffer in StoreType-s.
public:
// Normal constructor
Buffer(size_t size = 0) : mpBuffer(nullptr), mSize(size)
{
if (size > 0)
{
mpBuffer = static_cast<StoreType*>(std::malloc(size * sizeof(StoreType)));
if (!mpBuffer)
throw std::bad_alloc();
}
}
// Move constructor
Buffer(Buffer&& source) : mpBuffer(source.mpBuffer), mSize(source.mSize)
{
source.mSize = 0;
source.mpBuffer = nullptr;
}
// Move assignment
Buffer& operator=(Buffer&& source)
{
if (mpBuffer)
std::free(mpBuffer);
mpBuffer = source.mpBuffer;
mSize = source.mSize;
source.mSize = 0;
source.mpBuffer = nullptr;
return *this;
}
void resize(size_t newSize)
{
mpBuffer = static_cast<StoreType*>(std::realloc(mpBuffer, newSize * sizeof(StoreType)));
if (!mpBuffer)
throw std::bad_alloc();
mSize = newSize;
}
// Get the data start
inline StoreType* operator*() const
{
return mpBuffer;
}
// Get the owned size
inline PRIME const & size() const
{
return mSize;
}
// clean up.
~Buffer()
{
if (mpBuffer)
std::free(mpBuffer);
}
} mBuffer;
// Raw pointer to the store for speed.
StoreType * mStorePtr;
// Lookup table for the mask bit from the modulus
static StoreType const mMaskLookup[modulus];
public:
class MaskIterator
{
StoreType const * index, *repStart, *repEnd;
public:
MaskIterator(Masker const * origin)
: index(origin->mStorePtr)
, repStart(index + origin->initSize)
, repEnd(repStart + origin->repSize)
{}
inline StoreType const & next() // returns value with iterator post-increment
{
auto const & retval = *index;
if (++index >= repEnd)
index = repStart;
return retval;
}
};
// Default constructor. Makes a dummy mask for overwriting.
Masker()
: initSize(0)
, repSize(1)
, mBuffer(1)
, mStorePtr(*mBuffer)
{
*mStorePtr = 0; // nothing marked unprime
}
// Move constructor (destroys source)
Masker(Masker&& source)
: initSize(source.initSize)
, repSize(source.repSize)
, mBuffer(std::move(source.mBuffer))
, mStorePtr(*mBuffer)
{
}
// Copy constructor (preserves source)
Masker(Masker& source)
: initSize(source.initSize)
, repSize(source.repSize)
, mBuffer(initSize + repSize)
, mStorePtr(*mBuffer)
{
memcpy(*mBuffer, *source.mBuffer, mBuffer.size() * sizeof(StoreType));
}
// move (destroys source)
Masker& operator=(Masker&& source)
{
initSize = source.initSize;
repSize = source.repSize;
mStorePtr = source.mStorePtr;
mBuffer = std::move(source.mBuffer);
return *this;
}
// Construct for a single prime
Masker(PRIME prime)
: initSize((prime/modulus)+1)
, repSize(prime)
, mBuffer(initSize + repSize)
, mStorePtr(*mBuffer)
{
// Pre-zero the buffer. For bigger primes we don't write every word.
// This is fast enough not to care about excess writes.
memset(mStorePtr, 0, mBuffer.size() * sizeof(StoreType));
// The max prime we'll store
PRIME const maxPrime = (initSize + repSize) * modulus;
// Filter all the bits. Note that although we
// don't strictly need to filter from prime*3,
// but only from prime**2, doing from *3 makes
// the init block smaller.
PRIME const prime2 = prime * 2;
PRIME const prime3 = prime * 3;
PRIME const modstep = prime % modulus;
PRIME const wordstep = prime / modulus;
PRIME mod = prime3 % modulus;
PRIME word = prime3 / modulus;
do
{
auto const & mask = mMaskLookup[mod];
if (mask)
mStorePtr[word] |= mask;
mod += modstep;
if (mod >= modulus)
{
mod -= modulus;
word += wordstep + 1;
}
else word += wordstep;
}
while (word < mBuffer.size());
}
// Construct from two others, making a mask that excludes all numbers
// marked not prime in either of them. The output init block
// will be the largest from either of the inputs; the repeat block size will be the
// product of the input repeat sizes.
// The output could get quite large - 3.7E12 for all primes up to 37
Masker(const Masker& left, const Masker& right, size_t storeLimit = 0)
: initSize(std::max(left.initSize, right.initSize))
, repSize (left.repSize * right.repSize)
, mBuffer()
{
if (storeLimit)
repSize = std::min(initSize + repSize, storeLimit) - initSize;
auto storeSize = initSize + repSize;
// Actually construct the store with the desired size
mBuffer = storeSize;
mStorePtr = *mBuffer;
auto li = left.begin();
auto ri = right.begin();
for (size_t word = 0; word < storeSize; ++word)
{
mStorePtr[word] = li.next() | ri.next();
}
}
// Construct from several others, making a mask that excludes all numbers
// marked not prime in any of them. The output init block
// will be the largest from any of the inputs; the repeat size will be the
// product of all the input repeat sizes.
// The output could get quite large - 3.7E12 for all primes up to 37
// the type iterator should be an iterator into a collection of Masker-s.
template <typename iterator> Masker(const iterator& begin, const iterator& end, size_t storeLimit = 0)
: initSize(0)
, repSize(1)
, mBuffer() // empty for now
{
// Iterate over the inputs. We will
// * Determine the maximum init size
// * Determine the product of all the repeat sizes.
// * Record the number of primes we represent.
size_t nInputs = std::distance(begin, end);
std::vector<MaskIterator> iterators;
iterators.reserve(nInputs);
for (auto i = begin; i != end; ++i)
{
initSize = std::max(initSize, i->initSize);
repSize *= i->repSize;
iterators.push_back(i->begin());
}
if (storeLimit)
repSize = std::min(initSize + repSize, storeLimit) - initSize;
auto storeSize = initSize + repSize;
// Actually construct the store with the desired size
mBuffer = storeSize;
mStorePtr = *mBuffer;
// take the last one off (most efficient to remove)
// and use it as the initial mask value
auto last = iterators.back();
iterators.pop_back();
for (auto word = 0; word < storeSize; ++word)
{
StoreType mask = last.next();
for(auto &i: iterators)
{
mask |= i.next();
}
mStorePtr[word] = mask;
}
}
template <typename iterator> Masker& andequals(size_t cachesize, iterator begin, iterator end)
{
// first is the prime itself; second is the current multiple of it _without_ the last bit
struct value
{
PRIME const modstep;
size_t const wordstep;
PRIME mod;
size_t word;
value(PRIME prime)
: modstep(prime % modulus)
, wordstep(prime / modulus)
{
PRIME const primeS = prime * prime;
mod = primeS % modulus;
word = primeS / modulus;
}
bool operator<(size_t limit) const
{
return word < limit;
}
void next(StoreType * store)
{
auto const & mask = mMaskLookup[mod];
if (mask)
store[word] |= mask;
mod += modstep;
if (mod >= modulus)
{
mod -= modulus;
word += wordstep + 1;
}
else word += wordstep;
}
};
std::vector<value> values;
values.reserve(std::distance(begin, end));
for (auto prime = begin; prime != end; ++prime)
{
values.emplace_back(*prime);
}
size_t limit = 0;
do
{
limit = std::min(limit + cachesize, initSize + repSize);
for (auto & value: values)
{
while (value < limit)
value.next(mStorePtr);
}
limit += cachesize;
}
while (limit < initSize + repSize);
return *this;
}
// duplicates the data up to the amount needed for a prime newSize
void resize(PRIME newSize)
{
size_t sizeWords = (newSize / modulus) + 1;
assert(sizeWords > (initSize + repSize)); // not allowed to shrink!
mBuffer.resize(sizeWords);
mStorePtr = *mBuffer;
auto copySource = mStorePtr + initSize;
do
{
auto copySize = std::min(sizeWords - repSize - initSize, repSize);
auto dest = copySource + repSize;
memcpy(dest, copySource, copySize * sizeof(StoreType));
repSize += copySize;
} while ((initSize + repSize) < sizeWords);
}
// returns true if this mask thinks value is prime.
bool get(PRIME value) const
{
assert(value < modulus * mBuffer.size());
if ((value <= 3) || (value == 5))
return true;
auto mod = value % modulus;
auto mask = mMaskLookup[mod];
if (mask == 0)
return false;
auto word = value / modulus;
auto ret = mStorePtr[word] & mask;
return !ret;
}
// Get the beginning of the bitmap.
// Incrementing this iterator can continue indefinitely, regardless of the bitmap size.
MaskIterator begin() const
{
return MaskIterator(this);
}
size_t size() const
{
return initSize + repSize;
}
};
template<> uint8_t const Masker<uint8_t, 30>::mMaskLookup[30] =
{
0, // 0
0x01, // 1
0, // 2
0, // 3
0, // 4
0, // 5
0, // 6
0x02, // 7
0, // 8
0, // 9
0, // 10
0x04, // 11
0, // 12
0x08, // 13
0, // 14
0, // 15
0, // 16
0x10, // 17
0, // 18
0x20, // 19
0, // 20
0, // 21
0, // 22
0x40, // 23
0, // 24
0, // 25
0, // 26
0, // 27
0, // 28
0x80 // 29
};
typedef uint8_t StoreType;
typedef Masker<StoreType> Mask;
#ifndef tinyCount
constexpr size_t tinyCount = 7;
#endif
#ifndef smallCount
constexpr size_t smallCount = 10;
#endif
#ifndef bigCache
constexpr size_t bigCache = 0x80000;
#endif
int main(int argc, char**argv)
{
// find out if the user asked for a different number of primes
PRIME PrimeCount = 1e9;
if (argc >= 2)
{
std::istringstream arg1(argv[1]);
std::string crud;
arg1 >> PrimeCount >> crud;
if (!crud.empty() || PrimeCount < 3)
{
double isitfloat;
arg1.seekg(0, arg1.beg);
crud.clear();
arg1 >> isitfloat >> crud;
if (!crud.empty() || isitfloat < 3)
{
std::cerr << "Invalid first argument \"" << argv[1] << "\" Should be decimal number of primes required\n";
exit(42);
}
else PrimeCount = isitfloat;
}
}
std::string outName;
if (argc >= 3)
{
outName = argv[2];
}
#ifdef FAST
for (size_t bigCache = 0x80000; bigCache < 0x80001; ++bigCache)
#endif
{
Mask primes;
PRIME nextPrime;
{
nextPrime = 7;
Mask tiny(nextPrime);
std::vector<Mask> smallMasks;
smallMasks.emplace_back(tiny);
// Make a mask for the really small primes
size_t count = 1; // allows for the 3
for ( ; count < tinyCount; ++count)
{
do
{
nextPrime += 2;
} while (!tiny.get(nextPrime));
tiny = Mask(tiny, nextPrime);
}
// this limit is too small, but is easy to calculate and big enough
auto limit = nextPrime*nextPrime;
smallMasks.clear();
for (; count < smallCount; ++count)
{
do
{
nextPrime += 2;
} while (!tiny.get(nextPrime));
smallMasks.emplace_back(nextPrime);
}
assert(nextPrime <= limit && "If this triggers smallCount is too big and the code will give wrong results");
// Make a single masker for all the small primes. Don't make it bigger than needed.
auto small = Mask(smallMasks.begin(), smallMasks.end(), PrimeCount / 30 + 1);
primes = Mask(tiny, small, PrimeCount / 30 + 1);
}
// if the buffer isn't big enough yet expand it by duplicating early entries.
if (primes.size() < PrimeCount / 30 + 1)
{
primes.resize(PrimeCount);
}
do
{
std::vector<PRIME> quickies;
PRIME quickMaxPrime = std::min(nextPrime*nextPrime, PRIME(sqrt(PrimeCount) + 1));
do
{
do
{
nextPrime += 2;
} while (!primes.get(nextPrime));
quickies.emplace_back(nextPrime);
} while (nextPrime < quickMaxPrime);
// this function adds all the quickies into the mask, but does all of them in
// one area of memory before moving onto the next. The area of memory,
// and the list of primes, both fit into the L1 cache.
// You may need to adjust bigCache for best performance.
primes.andequals(bigCache, quickies.begin(), quickies.end());
} while (nextPrime*nextPrime < PrimeCount);
std::cerr << "tinyCount " << tinyCount
<< " smallCount " << smallCount << " bigCache "
<< std::hex << "0x" << bigCache << std::endl;
if (outName.length())
{
std::ofstream outfile(outName);
outfile << "2\n";
for (PRIME prime = 3; prime < PrimeCount; prime += 2)
{
if (primes.get(prime))
{
outfile << prime << '\n';
}
}
}
}
return 0;
}