Newsportal USENET - Re: OT: Re: A very slow program

On 04/10/2024 05:53, Bonita Montero wrote:

Am 03.10.2024 um 18:06 schrieb Vir Campestris:

for 4e9 primes my machine take nearly 10 seconds for 4294967295
primes. The one I posted before over there takes 1.5 seconds.
Show me your code. I'm pretty sure you never posted faster code.

Try this.
--
/*
   Programme to calculate primes using the Sieve or Eratosthenes
   Copyright (C) 2024 the person known as Vir Campestris
   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.
   You should have received a copy of the GNU General Public License
   along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
#include <algorithm>
#include <cassert>
#include <chrono>
#include <climits>
#include <cmath>
#include <csignal>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <list>
#include <memory>
#include <new>
#include <thread>
#include <vector>
// If these trigger you have a really odd architecture.
static_assert(sizeof(uint8_t) == 1);
static_assert(CHAR_BIT == 8);
// The type of a prime number.
// size_t isn't big enough on a 32-bit machine with 2GB memory allocated and 16 primes per byte.
typedef uint64_t PRIME;
// This is the type of the word I operate in. The code will run OK with uin8_t or uint32_t,
// but *on my computer* it's slower.
typedef uint64_t StoreType;
// tuneable constants
constexpr size_t tinyCount = 5;
constexpr size_t smallCount = 10;
constexpr size_t bigCache = 16384;
// this is a convenience class for timing.
class Stopwatch {
   typedef std::chrono::steady_clock CLOCK;
   typedef std::pair<unsigned, std::chrono::time_point<CLOCK> > LAP;
   typedef std::vector<LAP> LAPS;
   LAPS mLaps;
public:
   typedef std::vector<std::pair<unsigned, double>> LAPTIMES;
   Stopwatch() {}
   // record the start of an interval
   void start()
   {
   mLaps.clear();
   mLaps.emplace_back(std::make_pair(0, CLOCK::now()));
   }
   void lap(unsigned label)
   {
   mLaps.emplace_back(std::make_pair(label, CLOCK::now()));
   }
   // record the end of an interval
   void stop()
   {
   mLaps.emplace_back(std::make_pair(-1, CLOCK::now()));
   }
   // report the difference between the last start and stop
   double delta() const
   {
   assert(mLaps.size() >= 2);
   assert(mLaps.front().first == 0);
   assert(mLaps.back().first == -1);
   return std::chrono::duration_cast<std::chrono::nanoseconds>(mLaps.back().second - mLaps.front().second).count() / (double)1e9;
   }
   LAPTIMES const laps() const
   {
   LAPTIMES ret;
   assert(mLaps.size() >= 2);
   ret.reserve(mLaps.size() - 1);
   auto lap = mLaps.begin();
   auto start = lap->second;
   for(++lap; lap != mLaps.end(); ++lap)
   {
   ret.emplace_back(
   std::make_pair(
   lap->first,
std::chrono::duration_cast<std::chrono::nanoseconds>(lap->second - start).count() / (double)1e9));
   }
   return ret;
   }
};
// Internal class used to store the mask data for one or more
// primes rather than all primes. This has an initial block,
// followed by a block which contains data which is repeated
// for all higher values.
// Example: StoreType== uint8_t, prime == 3, the mask should be
// 90    meaning 1,3,5,7,11,13 are prime but not 9, 15
// 24,49,92    covers odds 17-63
// 24,49,92    covers 65-111, with an identical mask
// As the mask is identical we only need to store the
// non-repeat, plus one copy of the repeated block and data to
// show where it starts and ends. Note that for big primes the
// initial block may be more than one StoreType.
// As there are no common factors between any prime and the
// number of bits in StoreType the repeat is the size of the
// prime.
// Note also that a masker for two primes will have an
// initial block which is as big as the larger of the sources,
// and a repeat which is the product of the sources.
class Masker final
{
   // The length of the initial block.
   size_t initSize;
   // The size of the repeated part. The entire store size is the sum of these two.
   size_t repSize;
   // Buffer management local class. I started using std::vector for the store,
   // but I found it was slow for large sizes as it insists on calling the constructor
   // for each item in turn. And there may be billions of them.
   class Buffer final
   {
   StoreType* mpBuffer; // base of the data referred to
   size_t mSize;    // size of the buffer in StoreType-s.
public:
// Normal constructor
   Buffer(size_t size = 0) : mpBuffer(nullptr), mSize(size)
   {
   if (size > 0)
   {
   mpBuffer = static_cast<StoreType*>(std::malloc(size * sizeof(StoreType)));
   if (!mpBuffer)
   throw std::bad_alloc();
   }
   }
   Buffer(Buffer&) = delete;
   // Move constructor
   Buffer(Buffer&& source)
   : mpBuffer(source.mpBuffer), mSize(source.mSize)
   {
   source.mSize = 0;
   source.mpBuffer = nullptr;
   };
   Buffer& operator=(Buffer&) = delete;
   // Move assignment
   Buffer& operator=(Buffer&& source)
   {
   if (mpBuffer)
   std::free(mpBuffer);
   mpBuffer = source.mpBuffer;
   mSize = source.mSize;
   source.mSize = 0;
   source.mpBuffer = nullptr;
   return *this;
   }
   void resize(size_t newSize)
   {
   mpBuffer = static_cast<StoreType*>(std::realloc(mpBuffer, newSize * sizeof(StoreType)));
   if (!mpBuffer)
   throw std::bad_alloc();
   mSize = newSize;
   }
   // Get the data start
   inline StoreType* operator*() const
   {
   return mpBuffer;
   }
   // Get the owned size
   inline size_t const & size() const
   {
   return mSize;
   }
   // clean up.
   ~Buffer()
   {
   if (mpBuffer)
   std::free(mpBuffer);
   }
   } mBuffer;
   // Raw pointer to the store for speed.
   StoreType * mStorePtr;
   // shift from the prime value to the index in mStore
   static constexpr size_t wordshift = sizeof(StoreType) == 1 ? 4 : sizeof(StoreType) == 4 ? 6 : sizeof(StoreType) == 8 ? 7 : 0;
   static_assert(wordshift);
   // Mask for the bits in the store that select a prime.
   static constexpr size_t wordmask = (StoreType)-1 >> (sizeof(StoreType) * CHAR_BIT + 1 - wordshift);
   // convert a prime to the index of a word in the store
   static inline size_t index(PRIME const & prime)
   {
   return prime >> wordshift;
   }
   // get a mask representing the bit for a prime in a word
   static inline StoreType mask(PRIME const & prime)
   {
   return (StoreType)1 << ((prime >> 1) & wordmask);
   }
public:
   // This class allows us to iterate through a Masker indefinitely, retrieving words,
   // automatically wrapping back to the beginning of the repat part when the end is met.
   class MaskIterator
   {
   StoreType const * index, *repStart, *repEnd;
   public:
   MaskIterator(Masker const * origin)
   : index(origin->mStorePtr)
   , repStart(index + origin->initSize)
   , repEnd(repStart + origin->repSize)
   {}
   inline StoreType const & operator*() const
   {
   return *index;
   }
   inline StoreType next() // returns value with iterator post-increment
   {
   auto & retval = *index;
   if (++index >= repEnd)
   index = repStart;
   return retval;
   }
   };
   // Default constructor. Makes a dummy mask for overwriting.
   Masker()
   : initSize(0)
   , repSize(1)
   , mBuffer(1)
   , mStorePtr(*mBuffer)
   {
   *mStorePtr = 0; // nothing marked unprime
   }
   // Move constructor (destroys source)
   Masker(Masker&& source)
   : initSize(source.initSize)
   , repSize(source.repSize)
   , mBuffer(std::move(source.mBuffer))
   , mStorePtr(*mBuffer)
   {
   }
   // Copy constructor (preserves source)
   Masker(Masker& source)
   : initSize(source.initSize)
   , repSize(source.repSize)
   , mBuffer(initSize + repSize)
   , mStorePtr(*mBuffer)
   {
   memcpy(*mBuffer, *source.mBuffer, mBuffer.size() * sizeof(StoreType));
   }
   // move assignment (destroys source)
   Masker& operator=(Masker&& source)
   {
   initSize = source.initSize;
   repSize = source.repSize;
   mStorePtr = source.mStorePtr;
   mBuffer = std::move(source.mBuffer);
   return *this;
   }
   // Construct for a single prime
   Masker(PRIME prime)
   : initSize(this->index(prime)+1)
   , repSize(prime)
   , mBuffer(initSize + prime)
   , mStorePtr(*mBuffer)
   {
   // Pre-zero the buffer, because for bigger primes we don't write every word.
   // This is fast enough not to care about excess writes.
   memset(mStorePtr, 0, mBuffer.size() * sizeof(StoreType));
   // The max prime we'll actually store.
   // *2 because we don't store evens.
   PRIME const maxPrime = (initSize + repSize) * CHAR_BIT * sizeof(StoreType) * 2;
   // Filter all the bits. Note that although we
   // don't strictly need to filter from prime*3,
   // but only from prime**2, doing from *3 makes
   // the init block smaller.
   PRIME const prime2 = prime * 2;
   for (PRIME value = prime * 3; value < maxPrime; value += prime2)
   {
   mStorePtr[this->index(value)] |= this->mask(value);
   }
   }
   // Construct from two others, making a mask that excludes all numbers
   // marked not prime in either of them. The output init block
   // will be the largest from either of the inputs; the repeat block size will be the
   // product of the input repeat sizes.
   // The output could get quite large - 3.7E12 for all primes up to 37
   Masker(const Masker& left, const Masker& right, size_t storeLimit = 0)
   : initSize(std::max(left.initSize, right.initSize))
   , repSize (left.repSize * right.repSize)
   , mBuffer()
   {
   if (storeLimit)
   repSize = std::min(initSize + repSize, storeLimit) - initSize;
   auto storeSize = initSize + repSize;
   // Actually construct the store with the desired size
   mBuffer = storeSize;
   mStorePtr = *mBuffer;
   // get iterators to the two inputs. These automatically wrap
   // when their repsize is reached.
   auto li = left.begin();
   auto ri = right.begin();
   for (size_t word = 0; word < storeSize; ++word)
   {
   mStorePtr[word] = li.next() | ri.next();
   }
   }
   // Construct from several others, making a mask that excludes all numbers
   // marked not prime in any of them. The output init block
   // will be the largest from any of the inputs; the repeat size will be the
   // product of all the input repeat sizes.
   // The output could get quite large - 3.7E12 for all primes up to 37
   // the type iterator should be an iterator into a collection of Masker-s.
   template <typename iterator> Masker(const iterator& begin, const iterator& end, size_t storeLimit = 0)
   : initSize(0)
   , repSize(1)
   , mBuffer() // empty for now
   {
   // Iterate over the inputs. We will
   // * Determine the maximum init size
   // * Determine the product of all the repeat sizes.
   // * Record the number of primes we represent.
   size_t nInputs = std::distance(begin, end);
   std::vector<MaskIterator> iterators;
   iterators.reserve(nInputs);
   for (auto i = begin; i != end; ++i)
   {
   initSize = std::max(initSize, i->initSize);
   repSize *= i->repSize;
   iterators.push_back(i->begin());
   }
   if (storeLimit)
   repSize = std::min(initSize + repSize, storeLimit) - initSize;
   auto storeSize = initSize + repSize;
   // Actually construct the store with the desired size
   mBuffer = storeSize;
   mStorePtr = *mBuffer;
   // take the last one off (most efficient to remove)
   // and use it as the initial mask value
   auto last = iterators.back();
   iterators.pop_back();
   for (auto word = 0; word < storeSize; ++word)
   {
   StoreType mask = last.next();
   for(auto &i: iterators)
   {
   mask |= i.next();
   }
   mStorePtr[word] = mask;
   }
   }
   template <typename iterator> Masker& andequals(size_t cachesize, iterator begin, iterator end)
   {
   static constexpr size_t wordshift = sizeof(StoreType) == 1 ? 4 : sizeof(StoreType) == 4 ? 6 : sizeof(StoreType) == 8 ? 7 : 0;
   static constexpr size_t wordmask = (StoreType)-1 >> (sizeof(StoreType) * CHAR_BIT + 1 - wordshift);
   struct value
   {
   PRIME step;    // this is the prime. The step should be 2P, but only half the bits are stored
   PRIME halfMultiple;    // this is the current multiple, _without_ the last bit
   value(PRIME prime): step(prime), halfMultiple((prime*prime) >> 1) {}
   bool operator<(PRIME halfLimit) const
   {
   return halfMultiple < halfLimit;
   }
   void next(StoreType * store)
   {
   static constexpr size_t wsm1 = wordshift - 1;
   auto index = halfMultiple >> wsm1;
   auto mask = (StoreType)1 << (halfMultiple & wordmask);
   *(store + index) |= mask;
   halfMultiple += step;
   }
   };
   std::vector<value> values;
   values.reserve(std::distance(begin, end));
   for (auto prime = begin; prime != end; ++prime)
   {
   auto p = *prime;
   values.emplace_back(p);
   }
   size_t limit = 0;
   do
   {
   limit = std::min(limit + cachesize, initSize + repSize + 1);
   PRIME halfMaxPrime = (limit * 16 * sizeof(StoreType)) / 2 + 1;
   for (auto & value: values)
   {
   while (value < halfMaxPrime)
   value.next(mStorePtr);
   }
   }
   while (limit < initSize + repSize);
   return *this;
   }
   // duplicates the data up to the amount needed for a prime newSize
   void resize(PRIME newSize)
   {
   size_t sizeWords = this->index(newSize) + 1;
   assert(sizeWords > (initSize + repSize)); // not allowed to shrink!
   mBuffer.resize(sizeWords);
   mStorePtr = *mBuffer;
   auto copySource = mStorePtr + initSize;
   do
   {
   auto copySize = std::min(sizeWords - repSize - initSize, repSize);
   auto dest = copySource + repSize;
   memcpy(dest, copySource, copySize * sizeof(StoreType));
   repSize += copySize;
   } while ((initSize + repSize) < sizeWords);
   }
   // returns true if this mask thinks value is prime.
   bool get(PRIME value) const
   {
   assert(value < sizeof(StoreType) * CHAR_BIT * 2 * mBuffer.size());
   auto ret =
   (value <= 3)
   || ((value & 1) &&
   (mStorePtr[this->index(value)] & this->mask(value)) == 0);
   return ret;
   }
   // Get the beginning of the bitmap.
   // Incrementing this iterator can continue indefinitely, regardless of the bitmap size.
   MaskIterator begin() const
   {
   return MaskIterator(this);
   }
   size_t repsize() const
   {
   return repSize;
   }
   size_t size() const
   {
   return initSize + repSize;
   }
   // prints a collection to stderr
   void dump(size_t limit = 0) const
   {
   std::cerr << std::dec << repSize;
   std::cerr << std::hex;
   if (limit == 0) limit = initSize + repSize;
   auto iter = begin();
   for (auto i = 0; i < limit; ++i)
   {
   // cast prevents attempting to print uint8_t as a char
   std::cerr << ',' << std::setfill('0') << std::setw(sizeof(StoreType) * 2) << uint64_t(mStorePtr[i]);
   }
   std::cerr << std::endl;
   std::cerr << std::dec << repSize;
   for (auto p = 1; p < limit * 16 * sizeof(StoreType); ++p)
   {
   if (get(p))
   {
   std::cerr << ',' << p;
   }
   }
   std::cerr << std::endl;
   }
};
int main(int argc, char**argv)
{
   // find out if the user asked for a different number of primes
   PRIME PrimeCount = 1e9;
   if (argc >= 2)
   {
   std::istringstream arg1(argv[1]);
   std::string crud;
   arg1 >> PrimeCount >> crud;
   if (!crud.empty() || PrimeCount < 3)
   {
   double isitfloat;
   arg1.seekg(0, arg1.beg);
   crud.clear();
   arg1 >> isitfloat >> crud;
   if (!crud.empty() || isitfloat < 3)
   {
   std::cerr << "Invalid first argument \"" << argv[1] << "\" Should be decimal number of primes required\n";
   exit(42);
   }
   else PrimeCount = isitfloat;
   }
   }
   std::string outName;
   if (argc >= 3)
   {
   outName = argv[2];
   }
   Stopwatch s; s.start();
   Masker primes;
   PRIME nextPrime;
   {
   nextPrime = 3;
   Masker tiny(nextPrime);
   std::vector<Masker> smallMasks;
   smallMasks.emplace_back(tiny);
   // Make a mask for the really small primes, 3 5 7 11
   size_t count = 1;    // allows for the 3
   for ( ; count < tinyCount; ++count)
   {
   do
   {
   nextPrime += 2;
   } while (!tiny.get(nextPrime));
   tiny = Masker(tiny, nextPrime);
   }
   // that masker for three will be correct up until 5 squared,
   // the first non-prime whose smallest root is not 2 or 3
   assert (nextPrime < 25 && "If this triggers tinyCount is too big and the code will give wrong results");
   // this limit is too small, but is easy to calculate and big enough
   auto limit = nextPrime*nextPrime;
   smallMasks.clear();
   for (; count < smallCount; ++count)
   {
   do
   {
   nextPrime += 2;
   } while (!tiny.get(nextPrime));
   smallMasks.emplace_back(nextPrime);
   }
   assert(nextPrime <= limit && "If this triggers smallCount is too big and the code will give wrong results");
   // Make a single masker for all the small primes. Don't make it bigger than needed.
   auto small = Masker(smallMasks.begin(), smallMasks.end(), PrimeCount / (2 * CHAR_BIT * sizeof(StoreType)) + 1);
   s.lap(__LINE__);
   // This is the first step that takes an appreciable time. 2.5s for primes up to 1e11 on my machine.
   primes = Masker(tiny, small, PrimeCount / (2 * CHAR_BIT * sizeof(StoreType)) + 1);
   s.lap(__LINE__);
   }
   // if the buffer isn't big enough yet expand it by duplicating early entries.
   if (primes.size() < PrimeCount / (2 * CHAR_BIT * sizeof(StoreType)) + 1)
   {
   primes.resize(PrimeCount);
   s.lap(__LINE__);
   }
   do
   {
   std::vector<PRIME> quickies;
   PRIME quickMaxPrime = std::min(nextPrime*nextPrime, PRIME(sqrt(PrimeCount) + 1));
   do
   {
   do
   {
   nextPrime += 2;
   } while (!primes.get(nextPrime));
   quickies.emplace_back(nextPrime);
   } while (nextPrime < quickMaxPrime);
   s.lap(__LINE__);
   // this function adds all the quickies into the mask, but does all of them in
   // one area of memory before moving onto the next. The area of memory,
   // and the list of primes, both fit into the L1 cache.
   // You may need to adjust bigCache for best performance.
   // This step takes a while - two lots of 20s for qe11 primes.
   primes.andequals(bigCache, quickies.begin(), quickies.end());
   s.lap(__LINE__);
   } while (nextPrime*nextPrime < PrimeCount);
   s.stop();
   std::cerr << primes.size() << "," << s.laps().back().second << std::endl;
   for (auto l: s.laps()) std::cerr << l.first << "," << l.second << std::endl;
   if (outName.length())
   {
   std::ofstream outfile(outName);
   outfile << "2\n";
   for (PRIME prime = 3; prime < PrimeCount; prime += 2)
   {
   if (primes.get(prime))
   {
   outfile << prime << '\n';
   }
   }
   }
   return 0;
}

Date	Sujet	#	Auteur
15 Sep 24	A very slow program	21	Student Project
16 Sep 24	Re: A very slow program	1	Kaz Kylheku
16 Sep 24	Re: A very slow program	3	Paul
2 Oct 24	Re: A very slow program	2	Vir Campestris
2 Oct 24	Re: A very slow program	1	Paul
2 Oct 24	Re: A very slow program	14	Bonita Montero
3 Oct 24	OT: Re: A very slow program	12	Vir Campestris
4 Oct 24	Re: OT: Re: A very slow program	11	Bonita Montero
20 Oct 24	Re: OT: Re: A very slow program	10	Vir Campestris
20 Oct 24	Re: OT: Re: A very slow program	2	Kenny McCormack
26 Oct 24	Re: OT: Re: A very slow program	1	Vir Campestris
21 Oct 24	Re: OT: Re: A very slow program	6	Bonita Montero
26 Oct 24	Re: OT: Re: A very slow program	5	Vir Campestris
26 Oct 24	Re: OT: Re: A very slow program	4	Bonita Montero
26 Oct 24	Re: OT: Re: A very slow program	3	Vir Campestris
27 Oct 24	Re: OT: Re: A very slow program	2	Bonita Montero
28 Oct 24	Re: OT: Re: A very slow program	1	Vir Campestris
22 Oct 24	Re: OT: Re: A very slow program	1	Richard Harnden
3 Oct 24	Re: A very slow program	1	Paul
2 Oct 24	Re: A very slow program	2	Janis Papanagnou
2 Oct 24	Re: A very slow program	1	Keith Thompson