On 04/10/2024 05:53, Bonita Montero wrote:
Am 03.10.2024 um 18:06 schrieb Vir Campestris:
for 4e9 primes my machine take nearly 10 seconds for 4294967295
primes. The one I posted before over there takes 1.5 seconds.
Show me your code. I'm pretty sure you never posted faster code.
Try this.
-- /* Programme to calculate primes using the Sieve or Eratosthenes Copyright (C) 2024 the person known as Vir Campestris This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <
https://www.gnu.org/licenses/>.
*/
#include <algorithm>
#include <cassert>
#include <chrono>
#include <climits>
#include <cmath>
#include <csignal>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <list>
#include <memory>
#include <new>
#include <thread>
#include <vector>
// If these trigger you have a really odd architecture.
static_assert(sizeof(uint8_t) == 1);
static_assert(CHAR_BIT == 8);
// The type of a prime number.
// size_t isn't big enough on a 32-bit machine with 2GB memory allocated and 16 primes per byte.
typedef uint64_t PRIME;
// This is the type of the word I operate in. The code will run OK with uin8_t or uint32_t,
// but *on my computer* it's slower.
typedef uint64_t StoreType;
// tuneable constants
constexpr size_t tinyCount = 5;
constexpr size_t smallCount = 10;
constexpr size_t bigCache = 16384;
// this is a convenience class for timing.
class Stopwatch {
typedef std::chrono::steady_clock CLOCK;
typedef std::pair<unsigned, std::chrono::time_point<CLOCK> > LAP;
typedef std::vector<LAP> LAPS;
LAPS mLaps;
public:
typedef std::vector<std::pair<unsigned, double>> LAPTIMES;
Stopwatch() {}
// record the start of an interval
void start()
{
mLaps.clear();
mLaps.emplace_back(std::make_pair(0, CLOCK::now()));
}
void lap(unsigned label)
{
mLaps.emplace_back(std::make_pair(label, CLOCK::now()));
}
// record the end of an interval
void stop()
{
mLaps.emplace_back(std::make_pair(-1, CLOCK::now()));
}
// report the difference between the last start and stop
double delta() const
{
assert(mLaps.size() >= 2);
assert(mLaps.front().first == 0);
assert(mLaps.back().first == -1);
return std::chrono::duration_cast<std::chrono::nanoseconds>(mLaps.back().second - mLaps.front().second).count() / (double)1e9;
}
LAPTIMES const laps() const
{
LAPTIMES ret;
assert(mLaps.size() >= 2);
ret.reserve(mLaps.size() - 1);
auto lap = mLaps.begin();
auto start = lap->second;
for(++lap; lap != mLaps.end(); ++lap)
{
ret.emplace_back(
std::make_pair(
lap->first,
std::chrono::duration_cast<std::chrono::nanoseconds>(lap->second - start).count() / (double)1e9));
}
return ret;
}
};
// Internal class used to store the mask data for one or more
// primes rather than all primes. This has an initial block,
// followed by a block which contains data which is repeated
// for all higher values.
// Example: StoreType== uint8_t, prime == 3, the mask should be
// 90 meaning 1,3,5,7,11,13 are prime but not 9, 15
// 24,49,92 covers odds 17-63
// 24,49,92 covers 65-111, with an identical mask
// As the mask is identical we only need to store the
// non-repeat, plus one copy of the repeated block and data to
// show where it starts and ends. Note that for big primes the
// initial block may be more than one StoreType.
// As there are no common factors between any prime and the
// number of bits in StoreType the repeat is the size of the
// prime.
// Note also that a masker for two primes will have an
// initial block which is as big as the larger of the sources,
// and a repeat which is the product of the sources.
class Masker final
{
// The length of the initial block.
size_t initSize;
// The size of the repeated part. The entire store size is the sum of these two.
size_t repSize;
// Buffer management local class. I started using std::vector for the store,
// but I found it was slow for large sizes as it insists on calling the constructor
// for each item in turn. And there may be billions of them.
class Buffer final
{
StoreType* mpBuffer; // base of the data referred to
size_t mSize; // size of the buffer in StoreType-s.
public:
// Normal constructor
Buffer(size_t size = 0) : mpBuffer(nullptr), mSize(size)
{
if (size > 0)
{
mpBuffer = static_cast<StoreType*>(std::malloc(size * sizeof(StoreType)));
if (!mpBuffer)
throw std::bad_alloc();
}
}
Buffer(Buffer&) = delete;
// Move constructor
Buffer(Buffer&& source)
: mpBuffer(source.mpBuffer), mSize(source.mSize)
{
source.mSize = 0;
source.mpBuffer = nullptr;
};
Buffer& operator=(Buffer&) = delete;
// Move assignment
Buffer& operator=(Buffer&& source)
{
if (mpBuffer)
std::free(mpBuffer);
mpBuffer = source.mpBuffer;
mSize = source.mSize;
source.mSize = 0;
source.mpBuffer = nullptr;
return *this;
}
void resize(size_t newSize)
{
mpBuffer = static_cast<StoreType*>(std::realloc(mpBuffer, newSize * sizeof(StoreType)));
if (!mpBuffer)
throw std::bad_alloc();
mSize = newSize;
}
// Get the data start
inline StoreType* operator*() const
{
return mpBuffer;
}
// Get the owned size
inline size_t const & size() const
{
return mSize;
}
// clean up.
~Buffer()
{
if (mpBuffer)
std::free(mpBuffer);
}
} mBuffer;
// Raw pointer to the store for speed.
StoreType * mStorePtr;
// shift from the prime value to the index in mStore
static constexpr size_t wordshift = sizeof(StoreType) == 1 ? 4 : sizeof(StoreType) == 4 ? 6 : sizeof(StoreType) == 8 ? 7 : 0;
static_assert(wordshift);
// Mask for the bits in the store that select a prime.
static constexpr size_t wordmask = (StoreType)-1 >> (sizeof(StoreType) * CHAR_BIT + 1 - wordshift);
// convert a prime to the index of a word in the store
static inline size_t index(PRIME const & prime)
{
return prime >> wordshift;
}
// get a mask representing the bit for a prime in a word
static inline StoreType mask(PRIME const & prime)
{
return (StoreType)1 << ((prime >> 1) & wordmask);
}
public:
// This class allows us to iterate through a Masker indefinitely, retrieving words,
// automatically wrapping back to the beginning of the repat part when the end is met.
class MaskIterator
{
StoreType const * index, *repStart, *repEnd;
public:
MaskIterator(Masker const * origin)
: index(origin->mStorePtr)
, repStart(index + origin->initSize)
, repEnd(repStart + origin->repSize)
{}
inline StoreType const & operator*() const
{
return *index;
}
inline StoreType next() // returns value with iterator post-increment
{
auto & retval = *index;
if (++index >= repEnd)
index = repStart;
return retval;
}
};
// Default constructor. Makes a dummy mask for overwriting.
Masker()
: initSize(0)
, repSize(1)
, mBuffer(1)
, mStorePtr(*mBuffer)
{
*mStorePtr = 0; // nothing marked unprime
}
// Move constructor (destroys source)
Masker(Masker&& source)
: initSize(source.initSize)
, repSize(source.repSize)
, mBuffer(std::move(source.mBuffer))
, mStorePtr(*mBuffer)
{
}
// Copy constructor (preserves source)
Masker(Masker& source)
: initSize(source.initSize)
, repSize(source.repSize)
, mBuffer(initSize + repSize)
, mStorePtr(*mBuffer)
{
memcpy(*mBuffer, *source.mBuffer, mBuffer.size() * sizeof(StoreType));
}
// move assignment (destroys source)
Masker& operator=(Masker&& source)
{
initSize = source.initSize;
repSize = source.repSize;
mStorePtr = source.mStorePtr;
mBuffer = std::move(source.mBuffer);
return *this;
}
// Construct for a single prime
Masker(PRIME prime)
: initSize(this->index(prime)+1)
, repSize(prime)
, mBuffer(initSize + prime)
, mStorePtr(*mBuffer)
{
// Pre-zero the buffer, because for bigger primes we don't write every word.
// This is fast enough not to care about excess writes.
memset(mStorePtr, 0, mBuffer.size() * sizeof(StoreType));
// The max prime we'll actually store.
// *2 because we don't store evens.
PRIME const maxPrime = (initSize + repSize) * CHAR_BIT * sizeof(StoreType) * 2;
// Filter all the bits. Note that although we
// don't strictly need to filter from prime*3,
// but only from prime**2, doing from *3 makes
// the init block smaller.
PRIME const prime2 = prime * 2;
for (PRIME value = prime * 3; value < maxPrime; value += prime2)
{
mStorePtr[this->index(value)] |= this->mask(value);
}
}
// Construct from two others, making a mask that excludes all numbers
// marked not prime in either of them. The output init block
// will be the largest from either of the inputs; the repeat block size will be the
// product of the input repeat sizes.
// The output could get quite large - 3.7E12 for all primes up to 37
Masker(const Masker& left, const Masker& right, size_t storeLimit = 0)
: initSize(std::max(left.initSize, right.initSize))
, repSize (left.repSize * right.repSize)
, mBuffer()
{
if (storeLimit)
repSize = std::min(initSize + repSize, storeLimit) - initSize;
auto storeSize = initSize + repSize;
// Actually construct the store with the desired size
mBuffer = storeSize;
mStorePtr = *mBuffer;
// get iterators to the two inputs. These automatically wrap
// when their repsize is reached.
auto li = left.begin();
auto ri = right.begin();
for (size_t word = 0; word < storeSize; ++word)
{
mStorePtr[word] = li.next() | ri.next();
}
}
// Construct from several others, making a mask that excludes all numbers
// marked not prime in any of them. The output init block
// will be the largest from any of the inputs; the repeat size will be the
// product of all the input repeat sizes.
// The output could get quite large - 3.7E12 for all primes up to 37
// the type iterator should be an iterator into a collection of Masker-s.
template <typename iterator> Masker(const iterator& begin, const iterator& end, size_t storeLimit = 0)
: initSize(0)
, repSize(1)
, mBuffer() // empty for now
{
// Iterate over the inputs. We will
// * Determine the maximum init size
// * Determine the product of all the repeat sizes.
// * Record the number of primes we represent.
size_t nInputs = std::distance(begin, end);
std::vector<MaskIterator> iterators;
iterators.reserve(nInputs);
for (auto i = begin; i != end; ++i)
{
initSize = std::max(initSize, i->initSize);
repSize *= i->repSize;
iterators.push_back(i->begin());
}
if (storeLimit)
repSize = std::min(initSize + repSize, storeLimit) - initSize;
auto storeSize = initSize + repSize;
// Actually construct the store with the desired size
mBuffer = storeSize;
mStorePtr = *mBuffer;
// take the last one off (most efficient to remove)
// and use it as the initial mask value
auto last = iterators.back();
iterators.pop_back();
for (auto word = 0; word < storeSize; ++word)
{
StoreType mask = last.next();
for(auto &i: iterators)
{
mask |= i.next();
}
mStorePtr[word] = mask;
}
}
template <typename iterator> Masker& andequals(size_t cachesize, iterator begin, iterator end)
{
static constexpr size_t wordshift = sizeof(StoreType) == 1 ? 4 : sizeof(StoreType) == 4 ? 6 : sizeof(StoreType) == 8 ? 7 : 0;
static constexpr size_t wordmask = (StoreType)-1 >> (sizeof(StoreType) * CHAR_BIT + 1 - wordshift);
struct value
{
PRIME step; // this is the prime. The step should be 2P, but only half the bits are stored
PRIME halfMultiple; // this is the current multiple, _without_ the last bit
value(PRIME prime): step(prime), halfMultiple((prime*prime) >> 1) {}
bool operator<(PRIME halfLimit) const
{
return halfMultiple < halfLimit;
}
void next(StoreType * store)
{
static constexpr size_t wsm1 = wordshift - 1;
auto index = halfMultiple >> wsm1;
auto mask = (StoreType)1 << (halfMultiple & wordmask);
*(store + index) |= mask;
halfMultiple += step;
}
};
std::vector<value> values;
values.reserve(std::distance(begin, end));
for (auto prime = begin; prime != end; ++prime)
{
auto p = *prime;
values.emplace_back(p);
}
size_t limit = 0;
do
{
limit = std::min(limit + cachesize, initSize + repSize + 1);
PRIME halfMaxPrime = (limit * 16 * sizeof(StoreType)) / 2 + 1;
for (auto & value: values)
{
while (value < halfMaxPrime)
value.next(mStorePtr);
}
}
while (limit < initSize + repSize);
return *this;
}
// duplicates the data up to the amount needed for a prime newSize
void resize(PRIME newSize)
{
size_t sizeWords = this->index(newSize) + 1;
assert(sizeWords > (initSize + repSize)); // not allowed to shrink!
mBuffer.resize(sizeWords);
mStorePtr = *mBuffer;
auto copySource = mStorePtr + initSize;
do
{
auto copySize = std::min(sizeWords - repSize - initSize, repSize);
auto dest = copySource + repSize;
memcpy(dest, copySource, copySize * sizeof(StoreType));
repSize += copySize;
} while ((initSize + repSize) < sizeWords);
}
// returns true if this mask thinks value is prime.
bool get(PRIME value) const
{
assert(value < sizeof(StoreType) * CHAR_BIT * 2 * mBuffer.size());
auto ret =
(value <= 3)
|| ((value & 1) &&
(mStorePtr[this->index(value)] & this->mask(value)) == 0);
return ret;
}
// Get the beginning of the bitmap.
// Incrementing this iterator can continue indefinitely, regardless of the bitmap size.
MaskIterator begin() const
{
return MaskIterator(this);
}
size_t repsize() const
{
return repSize;
}
size_t size() const
{
return initSize + repSize;
}
// prints a collection to stderr
void dump(size_t limit = 0) const
{
std::cerr << std::dec << repSize;
std::cerr << std::hex;
if (limit == 0) limit = initSize + repSize;
auto iter = begin();
for (auto i = 0; i < limit; ++i)
{
// cast prevents attempting to print uint8_t as a char
std::cerr << ',' << std::setfill('0') << std::setw(sizeof(StoreType) * 2) << uint64_t(mStorePtr[i]);
}
std::cerr << std::endl;
std::cerr << std::dec << repSize;
for (auto p = 1; p < limit * 16 * sizeof(StoreType); ++p)
{
if (get(p))
{
std::cerr << ',' << p;
}
}
std::cerr << std::endl;
}
};
int main(int argc, char**argv)
{
// find out if the user asked for a different number of primes
PRIME PrimeCount = 1e9;
if (argc >= 2)
{
std::istringstream arg1(argv[1]);
std::string crud;
arg1 >> PrimeCount >> crud;
if (!crud.empty() || PrimeCount < 3)
{
double isitfloat;
arg1.seekg(0, arg1.beg);
crud.clear();
arg1 >> isitfloat >> crud;
if (!crud.empty() || isitfloat < 3)
{
std::cerr << "Invalid first argument \"" << argv[1] << "\" Should be decimal number of primes required\n";
exit(42);
}
else PrimeCount = isitfloat;
}
}
std::string outName;
if (argc >= 3)
{
outName = argv[2];
}
Stopwatch s; s.start();
Masker primes;
PRIME nextPrime;
{
nextPrime = 3;
Masker tiny(nextPrime);
std::vector<Masker> smallMasks;
smallMasks.emplace_back(tiny);
// Make a mask for the really small primes, 3 5 7 11
size_t count = 1; // allows for the 3
for ( ; count < tinyCount; ++count)
{
do
{
nextPrime += 2;
} while (!tiny.get(nextPrime));
tiny = Masker(tiny, nextPrime);
}
// that masker for three will be correct up until 5 squared,
// the first non-prime whose smallest root is not 2 or 3
assert (nextPrime < 25 && "If this triggers tinyCount is too big and the code will give wrong results");
// this limit is too small, but is easy to calculate and big enough
auto limit = nextPrime*nextPrime;
smallMasks.clear();
for (; count < smallCount; ++count)
{
do
{
nextPrime += 2;
} while (!tiny.get(nextPrime));
smallMasks.emplace_back(nextPrime);
}
assert(nextPrime <= limit && "If this triggers smallCount is too big and the code will give wrong results");
// Make a single masker for all the small primes. Don't make it bigger than needed.
auto small = Masker(smallMasks.begin(), smallMasks.end(), PrimeCount / (2 * CHAR_BIT * sizeof(StoreType)) + 1);
s.lap(__LINE__);
// This is the first step that takes an appreciable time. 2.5s for primes up to 1e11 on my machine.
primes = Masker(tiny, small, PrimeCount / (2 * CHAR_BIT * sizeof(StoreType)) + 1);
s.lap(__LINE__);
}
// if the buffer isn't big enough yet expand it by duplicating early entries.
if (primes.size() < PrimeCount / (2 * CHAR_BIT * sizeof(StoreType)) + 1)
{
primes.resize(PrimeCount);
s.lap(__LINE__);
}
do
{
std::vector<PRIME> quickies;
PRIME quickMaxPrime = std::min(nextPrime*nextPrime, PRIME(sqrt(PrimeCount) + 1));
do
{
do
{
nextPrime += 2;
} while (!primes.get(nextPrime));
quickies.emplace_back(nextPrime);
} while (nextPrime < quickMaxPrime);
s.lap(__LINE__);
// this function adds all the quickies into the mask, but does all of them in
// one area of memory before moving onto the next. The area of memory,
// and the list of primes, both fit into the L1 cache.
// You may need to adjust bigCache for best performance.
// This step takes a while - two lots of 20s for qe11 primes.
primes.andequals(bigCache, quickies.begin(), quickies.end());
s.lap(__LINE__);
} while (nextPrime*nextPrime < PrimeCount);
s.stop();
std::cerr << primes.size() << "," << s.laps().back().second << std::endl;
for (auto l: s.laps()) std::cerr << l.first << "," << l.second << std::endl;
if (outName.length())
{
std::ofstream outfile(outName);
outfile << "2\n";
for (PRIME prime = 3; prime < PrimeCount; prime += 2)
{
if (primes.get(prime))
{
outfile << prime << '\n';
}
}
}
return 0;
}