diff options
| -rw-r--r-- | docs/crandom_api.md | 28 | ||||
| -rw-r--r-- | misc/benchmarks/various/prng_bench.cpp | 31 | ||||
| -rw-r--r-- | misc/examples/smartpointers/arcvec_erase.c | 3 |
3 files changed, 28 insertions, 34 deletions
diff --git a/docs/crandom_api.md b/docs/crandom_api.md index 88924784..c6491243 100644 --- a/docs/crandom_api.md +++ b/docs/crandom_api.md @@ -8,27 +8,23 @@ See [random](https://en.cppreference.com/w/cpp/header/random) for similar c++ fu ## Description -**crand64** is a novel, very fast PRNG, suited for parallel usage. It features a -Weyl-sequence as part of its state. It is based on *sfc64*, but has a different output function -and state size. +**crand64** is a very fast PRNG, suited for parallel usage. It is based on *sfc64*, but has a +different output function and state size. It features a Weyl-sequence as part of its state. -**sfc64** is the fastest among *pcg*, *xoshiro`**`*, and *lehmer*. It is equally fast or faster than -*sfc64* on most platforms. *wyrand* is faster on platforms with fast 128-bit multiplication, and has -2^64 period (https://github.com/lemire/SwiftWyhash/issues/10). *wyrand* is not suited for massive -parallel usage due to its limited minimal period. +**crand64** is faster or equally fast as *wyrand*, *xoshiro\*\**, *sfc64*, and *romu_trio* +with both **clang 16.0** and **gcc 13.1** from the [prng_bench.cpp](../misc/benchmarks/various/prng_bench.cpp) +on windows 11, Ryzen 7 5700X. (clang does not optimize *xoshiro\*\** and *sfc64* as well as gcc does). -**crand64** does not require multiplication or 128-bit integer operations. It has 320 bit state, -where 64-bits are constant per prng instance created. - -There is no *jump function*, but each odd number Weyl-increment (state[4]) starts a new +**crand64** has no *jump function*, but each odd number Weyl-increment (state[4]) starts a new unique 2^64 *minimum* length period, i.e. virtually unlimited number of unique threads. +In contrast, *wyrand* and *sfc64* have only a (total) minimum period of 2^64 (*romu_trio* has +no guarantees), and may therefore not be suited for massive parallel usage (for purists). 
-**crand64** passes *PractRand* (tested up to 8TB output), Vigna's Hamming weight test, and simple -correlation tests, i.e. *n* interleaved streams with only one-bit differences in initial state. -Also 32-bit and 16-bit versions passes PractRand up to their size limits. +**crand64** does not require multiplication or 128-bit integer operations. It has 320 bit state, +where 64-bits are constant per instance. -For more, see the PRNG shootout by Vigna: http://prng.di.unimi.it and a debate between the authors of -xoshiro and pcg (Vigna/O'Neill) PRNGs: https://www.pcg-random.org/posts/on-vignas-pcg-critique.html +**crand64** passes *PractRand* (tested up to 8TB output), Vigna's Hamming weight test, and simple +correlation tests. The 16- and 32-bit variants also pass PractRand up to their size limits. ## Header file diff --git a/misc/benchmarks/various/prng_bench.cpp b/misc/benchmarks/various/prng_bench.cpp index 234e3805..cd43ff36 100644 --- a/misc/benchmarks/various/prng_bench.cpp +++ b/misc/benchmarks/various/prng_bench.cpp @@ -66,7 +66,7 @@ uint32_t pcg32(uint32_t s[2]) { } -/* xoshiro128+ */ +/* xo(ro)shiro */ uint64_t xoroshiro128plus(uint64_t s[2]) { const uint64_t s0 = s[0]; @@ -80,9 +80,6 @@ uint64_t xoroshiro128plus(uint64_t s[2]) { return result; } - -/* xoshiro256** */ - static inline uint64_t xoshiro256starstar(uint64_t s[4]) { const uint64_t result = rotl64(s[1] * 5, 7) * 9; const uint64_t t = s[1] << 17; @@ -95,7 +92,7 @@ static inline uint64_t xoshiro256starstar(uint64_t s[4]) { return result; } -// wyrand - 2020-12-07 +/* wyrand - 2020-12-07 */ static inline void _wymum(uint64_t *A, uint64_t *B){ #if defined(__SIZEOF_INT128__) __uint128_t r = *A; r *= *B; @@ -136,44 +133,44 @@ int main(void) for (size_t ti = 0; ti < 2; ti++) { init_state(rng.state, 12345123); cout << endl << "ROUND " << ti+1 << " ---------" << endl; - +/* beg = clock(); for (size_t i = 0; i < N; i++) - recipient[i] = romu_trio(rng.state); + recipient[i] = sfc32((uint32_t *)rng.state); 
end = clock(); - cout << "romu_trio:\t" + cout << "sfc32:\t\t" << (float(end - beg) / CLOCKS_PER_SEC) << "s: " << recipient[312] << endl; beg = clock(); for (size_t i = 0; i < N; i++) - recipient[i] = wyrand64(rng.state); + recipient[i] = stc32((uint32_t *)rng.state); end = clock(); - cout << "wyrand64:\t" + cout << "stc32:\t\t" << (float(end - beg) / CLOCKS_PER_SEC) << "s: " << recipient[312] << endl; beg = clock(); for (size_t i = 0; i < N; i++) - recipient[i] = sfc32((uint32_t *)rng.state); + recipient[i] = pcg32((uint32_t *)rng.state); end = clock(); - cout << "sfc32:\t\t" + cout << "pcg32:\t\t" << (float(end - beg) / CLOCKS_PER_SEC) << "s: " << recipient[312] << endl; - +*/ beg = clock(); for (size_t i = 0; i < N; i++) - recipient[i] = stc32((uint32_t *)rng.state); + recipient[i] = romu_trio(rng.state); end = clock(); - cout << "stc32:\t\t" + cout << "romu_trio:\t" << (float(end - beg) / CLOCKS_PER_SEC) << "s: " << recipient[312] << endl; beg = clock(); for (size_t i = 0; i < N; i++) - recipient[i] = pcg32((uint32_t *)rng.state); + recipient[i] = wyrand64(rng.state); end = clock(); - cout << "pcg32:\t\t" + cout << "wyrand64:\t" << (float(end - beg) / CLOCKS_PER_SEC) << "s: " << recipient[312] << endl; diff --git a/misc/examples/smartpointers/arcvec_erase.c b/misc/examples/smartpointers/arcvec_erase.c index ba54c1c7..9d757533 100644 --- a/misc/examples/smartpointers/arcvec_erase.c +++ b/misc/examples/smartpointers/arcvec_erase.c @@ -19,7 +19,8 @@ int main(void) // clone the second 2012 and push it back. // note: cloning make sure that vec.data[2] has ref count 2. - Vec_push(&vec, Arc_clone(vec.data[2])); + Vec_push(&vec, Arc_clone(vec.data[2])); // => share vec.data[2] + Vec_emplace(&vec, *vec.data[2].get); // => deep-copy vec.data[2] printf("vec before erase :"); c_foreach (i, Vec, vec) |
