performance of + and *, suggested improvement

I'm opening a separate issue, about the things I mentioned in https://github.com/scheinerman/Mods.jl/issues/25. This is done, so that my suggestion of improvement is a bit more visible, for later reference, and to give some benchmarks. 

The problem is the slow computation of matrix products:
```
julia> using SaferIntegers, Mods
julia> begin n=10^3;  m=11;
       x = Int.(      rand(-9:9,(n,n)));  @time x*x; 
       y = SafeInt64.(rand(-9:9,(n,n)));  @time (y*y) .%m;
       z = Mod{m}.(   rand(-9:9,(n,n)));  @time z*z; end;
  0.434071 seconds (5 allocations: 7.659 MiB)
  1.476792 seconds (9 allocations: 15.289 MiB)
538.278712 seconds (6.17 G allocations: 114.895 GiB, 35.82% gc time)
```

My suggestion is to use the following code, which will hopefully be integrated (eventually, in some form) into Mods.jl: 
```
struct Mod8{m}
    val ::UInt8 
    function Mod8{m}(val ::tw) where {m,tw <:UInt8}
        return new{m}(mod(val,m)) end end;  # Note: mod(val,m) is always positive, but rem(val,m) = val % m preserves the sign.
struct Mod16{m}
    val ::UInt16
    function Mod16{m}(val ::tw) where {m,tw <:UInt16}
        return new{m}(mod(val,m)) end end;
struct Mod32{m}
    val ::UInt32
    function Mod32{m}(val ::tw) where {m,tw <:UInt32}
        return new{m}(mod(val,m)) end end;
struct Mod64{m}
    val ::UInt64
    function Mod64{m}(val ::tw) where {m,tw <:UInt64}
        return new{m}(mod(val,m)) end end;
struct Mod128{m}
    val ::UInt128
    function Mod128{m}(val ::tw) where {m,tw <:UInt128}
        return new{m}(mod(val,m)) end end;
function modIntType(m ::Integer) ::DataType
    #= Create a type that has enough space so that overflow does not happen. 
    The condition m ≤ sqrt(typemax(UIntN)) makes sure that multiplication val*val is never larger than the underlying UIntN. =#
    @assert 2 ≤ m  
    if     m ≤ sqrt(typemax(UInt8))     return Mod8{  UInt8(  m)}
    elseif m ≤ sqrt(typemax(UInt16))    return Mod16{ UInt16( m)}
    elseif m ≤ sqrt(typemax(UInt32))    return Mod32{ UInt32( m)}
    elseif m ≤ sqrt(typemax(UInt64))    return Mod64{ UInt64( m)}
    elseif m ≤ sqrt(typemax(UInt128))   return Mod128{UInt128(m)}
    else error("modulus m must be in [2,...,2^128-1]") end end;
for T ∈ (Mod8,Mod16,Mod32,Mod64,Mod128)
    for tw ∈ (Int8,Int16,Int32,Int64,Int128,BigInt,SafeInt8,SafeInt16,SafeInt32,SafeInt64,SafeInt128)
        T{m}(val ::tw) where m = T{m}(typeof(m)(mod(val,m))) end;
    Base.zero(::T{m}) where m = T{m}(typeof(m)(0));
    Base.one( ::T{m}) where m = T{m}(typeof(m)(1));
    Base.:+(x ::T{m}) where m = x;
    Base.:-(x ::T{m}) where m = T{m}(m - x.val);
    Base.:+(x ::T{m}, y ::T{m}) where m = T{m}(x.val + y.val);
    Base.:-(x ::T{m}, y ::T{m}) where m = T{m}(x.val + m - y.val);
    Base.:*(x ::T{m}, y ::T{m}) where m = T{m}(x.val * y.val);
    Base.inv(x ::T{m}) where m = T{m}(invmod(x.val, m)); 
    Base.:^(x ::T{m}, k ::Integer) where m = T{m}(powermod(x.val, k, m)); 
    Base.:*(x ::T{m}, y ::Integer) where m = *(x, T{m}(y));
    Base.:*(x ::Integer, y ::T{m}) where m = *(T{m}(x), y);
    function Base.show(io ::IO, x ::T) ::Nothing  
        print(io, BigInt(x.val)) end; end;
```
Notice that no overflow occurs in `+` and `*`, since in `Mod8`, the values go from 0,...,2^8-1 but modulus is at most 2^4-1, in `Mod16`, the values go from 0,...,2^16-1 but modulus is at most 2^8-1, etc.

To compare the efficiency of my implementation to existing ones (and Float's and Int's), consider these benchmarks:
```
julia> using Mods, FiniteFields, GaloisFields, AbstractAlgebra, Nemo, Singular

julia> for T ∈ [modIntType(11), GaloisFields.GaloisField(11), FiniteFields.FFE{11}, AbstractAlgebra.GF(11), Singular.residue_ring(Singular.ZZ, 11), Nemo.FiniteField(11)[1]]
           n=10^3;  x = rand(T.(0:10),(n,n));  print(sizeof(x),"   ");  @time x*x; end
1000000     3.665314 seconds (5 allocations: 1008.172 KiB)
1000000     7.270208 seconds (5 allocations: 1008.172 KiB)
4000000    17.841132 seconds (5 allocations: 3.845 MiB)
16000000   19.344726 seconds (2 allocations: 15.259 MiB, 0.18% gc time)
8000000   682.773657 seconds (3.83 G allocations: 86.878 GiB, 20.13% gc time)
16000000   12.845152 seconds (2 allocations: 15.259 MiB)

julia> for T ∈ [Float64, Int64, SafeInt64, modIntType(11), GaloisFields.GaloisField(11), FiniteFields.FFE{11}, AbstractAlgebra.GF(11), Nemo.FiniteField(11)[1]]
           n=10^4;  x = rand(T.(0:10),(n,n));  print(sizeof(x),"   ");  @time x*x; end
800000000       15.452075 seconds (2 allocations: 762.939 MiB, 0.02% gc time)
800000000      477.965464 seconds (5 allocations: 762.970 MiB, 0.00% gc time)
800000000     1429.740835 seconds (5 allocations: 762.970 MiB, 0.00% gc time)
100000000     3684.688773 seconds (5 allocations: 95.398 MiB, 0.00% gc time)
100000000     7075.917578 seconds (5 allocations: 95.398 MiB, 0.00% gc time)
400000000    16646.365626 seconds (5 allocations: 381.500 MiB, 0.00% gc time)
1600000000   52944.319672 seconds (2 allocations: 1.490 GiB, 0.00% gc time)
1600000000   28602.384860 seconds (2 allocations: 1.490 GiB, 0.00% gc time)
```
As we can see, my implementation is the most performant of the 5 available ones. It still needs 1h to compute the product of 10Kx10K matrices, but the other methods need 2, 5, 14, 8 h, respectively. It is frustrating since the multiplication of Floats requires mere 16 seconds. My hope is that one day, Octavian.jl or Tullio.jl might be used, to speed up this calculation. 

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

performance of + and *, suggested improvement #31

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

performance of + and *, suggested improvement #31

Description

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions