Does someone see a way to speed this up
I am using a lot of function calls, would passing constant pointers speed up my program significantly? I am doing computations with modular forms (2 variable functions, written as a Fourier series in one variable). I am basically computing integrals of these functions and searching for maximums. The program takes a long time to run, I think much longer than it should. I don't expect people to read the code, just look at the way I am using cln. Is this an efficient way? #include <iostream.h> #include <fstream.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <cln/real.h> #include <cln/integer.h> #include <iostream.h> #include <fstream.h> #include <cln/io.h> #include <cln/integer_io.h> #include <cln/real_io.h> #include <cln/float_io.h> #include <cln/float_io.h> #include <cln/integer.h> #include <cln/real.h> #include <cln/complex.h> // We do I/O. #include <cln/io.h> #include <cln/integer_io.h> #include <cln/real_io.h> #include <cln/complex_io.h> // We use the timing functions. #include <cln/timing.h> //using namespace std; using namespace cln; typedef cln::cl_I bigI; typedef cln::cl_F bigF; typedef cln::cl_N bigC; typedef cln::cl_R bigR; bigF h(bigF y, bigF a[], int k, int M, int pr); bigF ff(bigF x, bigF y, bigF a[], int k, int M, int pr); bigF v(bigF y, bigF a[], int k, int M, int pr); bigF integralH(bigF a[], int k, int M, int n, int pr); bigF integralL(bigF a[], int k, int M, int n, int pr); bigF max(bigF a[][100],bigF b[], int k, int D, int M, bigF e, int pr); //for each weight k (2k gives actual weight), make 2d arrary of coefficients //write routine to search FD domain (less than 2k/pi, check the 2), first do //sup for indiv forms, than the average one. int main(int argc, char *argv[]) { //the number of coefficients to use int pbits= 128; int nPartitions = 100; int divisor = 4; float fbinsize = 0.1; int startK = 22; cout << "\n Enter prec, numPartitions, intervalSize, divisor to reduce coef, startK "; cin >> pbits >> nPartitions >> fbinsize >> divisor >> startK; std::string snum; char w; int K,P,D; bigI S, H; std::ifstream myfile ("full_output.txt"); if (myfile.is_open()) { while (! myfile.eof() ) { D = 0; myfile >> w >> K >> P >> D; bigI bigInt; int pr = pbits; cln::float_format_t prec = cln::float_format_t(pr); bigF binSize = cl_float(fbinsize,prec); bigF a[D][100]; bigF b[D]; { for (int n = 0; n < D; n++) { for (int u = 0; u < P; u++) { myfile >> S >> bigInt; a[n][u] = cl_float(bigInt,prec); } if (K >= startK) { b[n] = integralH(a[n],K, P/divisor, nPartitions, pr) + integralL(a[n],K, P/divisor, nPartitions, pr); } //b[n] = cl_float(1,prec); //cout << "\nLoaded Coeff"; } if (K >= startK) { if (D>0) { bigF sNorm = max(a,b,K,D,P, binSize, pr); cout << "\n" << K << "," << sNorm; } } } } myfile.close(); } else cout << "Unable to open file"; return 0; } //Used to do the integral over cylindar y > 1 bigF h(bigF y, bigF a[], int k, int M, int pr) { cln::float_format_t prec = cln::float_format_t(pr); const bigF PI = pi(prec); bigF s = cln::cl_float(0,prec); for (int n = 1; n < M; n++) if (cl_float(4,prec)*PI*cl_float(n,prec)*y < cl_float(300,prec)) s = s + a[n]*a[n]*exp(cl_float(-4,prec)*PI*cl_float(n,prec)*y); return exp(ln(y)*cl_float(2*k-2, prec))*s; } //ff is equiv to g, use it as a test and see which is more effiecient bigF ff(bigF x, bigF y, bigF a[], int k, int M, int pr) { cln::float_format_t prec = cln::float_format_t(pr); const bigF PI = pi(prec); bigF q = cl_float(0.0,prec); int j = 1; bigF w = cl_float(0.0,prec); int r = 0; bigF hh = cl_float(0,prec); for (j = 1; j <= M; j++) { if (cl_float(4,prec)*PI*cl_float(j,prec)*y < cl_float(50,prec)) w = w + a[j]*a[j]*exp(-cl_float(4,prec)*PI*cl_float(j,prec)*y); } for (r = 3; r <= M; r++) { q = cl_float(0.0,prec); for (j = 1; 2*j < r; j++) q = q + a[j]*a[r-j]*cos(cl_float(2,prec)*PI*(cl_float(2,prec)*cl_float(j,prec) - cl_float(r,prec))*x); if (cl_float(2,prec)*PI*cl_float(r,prec)*y < cl_float(50,prec)) q = q*exp(cl_float(-2,prec)*PI*cl_float(r,prec)*y); else q = cl_float(0,prec); w = w + 2*q; } bigF tt = cln::cl_float(2*k,prec); return exp(ln(y)*tt)*w; } bigF max(bigF a[][100], bigF b[], int k, int D, int M, bigF e, int pr) { bigF x,y,half; cln::float_format_t prec = cln::float_format_t(pr); bigF one = cl_float(1,prec); x = cl_float(0,prec); y = cl_float(1,prec); half = cl_float(0.5,prec); bigF K = cl_float(k,prec); bigF mxT = cl_float(0,prec); bigF sum = cl_float(0,prec); bigF w = sqrt(cl_float(3,prec))/cl_float(2,prec); while ( x < half) { y = cl_float(1,prec); while (y < K/cl_float(6,prec)) { sum = cl_float(0,prec); for (int i = 0; i < D; i++) sum = sum + ff(x,y,a[i],k,M,pr)/b[i]; if (sum > mxT) mxT = sum; y = y + e; } x = x + e; } bigF mxB = cl_float(0,prec); x = cl_float(0,prec); while ( x < half) { y = sqrt(cl_float(1) - x*x); while (y < one) { sum = cl_float(0,prec); for (int i = 0; i < D; i++) sum = sum + ff(x,y,a[i],k,M,pr)/b[i]; if (sum > mxB) mxB = sum; y = y + e; } x = x + e; } if (mxB > mxT) return mxB; return mxT; } //twice the Integral f(z) square in x direction from 0 to 1/2; //y is variable from sqrt(3)/2 to 1 bigF v(bigF y, bigF a[], int k, int M, int pr) { cln::float_format_t prec = cln::float_format(pr); const bigF PI = pi(prec); bigF q = cl_float(0.0, prec); bigF w = cl_float(0.0, prec); bigF one = cl_float(1, prec); bigF two = cl_float(2, prec); bigF three = cl_float(3, prec); bigF four = cl_float(4, prec); for (int j = 1; j <= M; j++) { bigF J = cl_float(j,prec); if (four*PI*J*y < cl_float(50,prec)) w = w + a[j]*a[j]*exp(-four*PI*J*y); } w = two*(cl_float(0.5,prec) - sqrt(one-y*y))*w; for (int r = 3; r <= M; r++) { bigF R = cl_float(r,prec); q = cl_float(0,prec); for (int j = 1; 2*j < r; j++) { bigF J = cl_float(j,prec); q = q + a[j]*a[r-j]/(two*PI*(two*J - R))* (sin(PI*(two*J - R)) - sin(two*PI*(two*J - R)*sqrt(one-y*y))); } //q = q + a[j]*a[r-j]*cos(2*PI*(2*j - r)*x); if (two*PI*R*y < cl_float(50,prec)) q = q*exp(-two*PI*R*y); else q = cl_float(0,prec); w = w + two*q; } //for (int n = 1; n < h; n++) s = s + a[n]*exp(2*PI*I*z*n); bigF tt = cln::cl_float((float)(2*k-2),prec); return exp(ln(y)*tt)*w; } bigF integralH(bigF a[], int k, int M, int n, int pr) { const bigF PI = "3.14159265358979323846264338327950288"; cln::float_format_t prec = cln::float_format_t(pr); bigF q = cl_float(0.0, prec); bigF w = cl_float(0.0, prec); bigF aa = cl_float(0.0, prec); bigF b = cl_float(0.0, prec); bigF d = cl_float(0.0, prec); bigF one = cl_float(1, prec); bigF two = cl_float(2, prec); bigF three = cl_float(3, prec); bigF four = cl_float(4, prec); bigF five = cl_float(5, prec); aa = one; //modify this later b = cl_float(k,prec)/two; d = (b-aa)/cl_float(n,prec); //f(x) is the dummy function q = h(aa,a,k,M, pr) + h(b,a,k,M, pr); for (int j = 1; 2*j <= n-2; j++) w = w + h(aa + (two*cl_float(j,prec))*d,a,k,M, pr); q = q + two*w; w = cl_float(0,prec); for (int j = 1; 2*j <= n; j++) w = w + h(aa + (two*cl_float(j,prec)-one)*d,a,k,M, pr); q = q + four*w; q = q*d/three; return q; } bigF integralL(bigF a[],int k, int M, int n, int pr) { cln::float_format_t prec = cln::float_format(pr); bigF q = cl_float(0.0, prec); bigF w = cl_float(0.0, prec); bigF aa = cl_float(0.0, prec); bigF b = cl_float(1.0, prec); bigF d = cl_float(0.0, prec); bigF one = cl_float(1, prec); bigF two = cl_float(2, prec); bigF three = cl_float(3, prec); bigF four = cl_float(4, prec); bigF five = cl_float(5, prec); aa = sqrt(three)/two; b = one; d = (b-aa)/cl_float(n,prec); //f(x) is the dummy function q = v(aa,a,k,M, pr) + v(b,a,k,M, pr); for (int j = 1; 2*j <= n-2; j++) w = w + v(aa + two*cl_float(j,prec)*d,a,k,M, pr); q = q + two*w; w = cl_float(0,prec); for (int j = 1; 2*j <= n; j++) w = w + v(aa + (two*cl_float(j,prec)-one)*d,a,k,M, pr); q = q + four*w; q = q*d/three; return q; } -- Joshua Friedman PhD CrownEagle@gmail.com http://www.math.sunysb.edu/~joshua
Hi Joshua,
I am using a lot of function calls, would passing constant pointers speed up my program significantly? ... I don't expect people to read the code, just look at the way I am using cln. Is this an efficient way?
Based on past experience, not on your program in particular, here's a list of suggestions, starting with the most promising ones, and the least promising ones at the end. 1) Use a profiler that displays the bottlenecks and relative consumption of CPU time. On Linux/x86, the tool of choice is valgrind <http://valgrind.org/docs/manual/cl-manual.html>, together with kcachegrind for the visualization of the results. 2) Are you using excess precision in large parts of the computations? Often, in order to get 20 digits of a result, you need 40 digits throughout the computation if you choose a constant precision. But sometimes you can actually do part of the computation with 22 digits and part with 40 digits, and the result will still have 20 digits of accuracy. For analyzing this, you need to have a certain understanding of the numerical stability of your algorithm. For example, when you are evaluating integrals along a path around a pole, you use 40 digits in the computation and get 20 digits accuracy in the result; but when you are evaluating an integrals along a path around a region with no singularity, then 40 digits in the computation normally lead to slightly less than 40 digits accuracy in the result. 3) When computing integrals, use higher-order integration methods than simple averaged summation, if the integrand has continuous derivates. If your integrand is infinitely differentiable, then Runge-Kutta (the formula with 1/6, 4/6, 1/6) is a big win, and higher-order Runge-Kutta methods are even better. 4) Pull calls to transcendental functions out of loops if possible. In CLN, elementary operations and sqrt are fast, whereas sin, exp, log, etc. are significantly slower. 5) Pull common computations out of loops. The compiler won't do it for it. Things like cl_float(2,prec)*PI are quite fast to compute, but you may save a couple of percent by moving all these "trivial" common subexpressions outside the loops. 6) Then only, start to worry about whether you pass numbers by "copy" (e.g. cl_N x) or by reference (e.g. const cl_N& x). CLN numbers are references anyway, so by doing this you can save the increment and decrement of a reference count, nothing else. If you had many function calls and many small-sized bignums and floats, you could save maybe at most 20% by using this trick. It's more a micro-optimization than the other five points. Bruno
participants (2)
-
Bruno Haible
-
Joshua Friedman