NFFT 3.5.3
fpt.c
1 /*
2  * Copyright (c) 2002, 2017 Jens Keiner, Stefan Kunis, Daniel Potts
3  *
4  * This program is free software; you can redistribute it and/or modify it under
5  * the terms of the GNU General Public License as published by the Free Software
6  * Foundation; either version 2 of the License, or (at your option) any later
7  * version.
8  *
9  * This program is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11  * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
12  * details.
13  *
14  * You should have received a copy of the GNU General Public License along with
15  * this program; if not, write to the Free Software Foundation, Inc., 51
16  * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17  */
18 
25 #include "config.h"
26 
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stdbool.h>
30 #include <math.h>
31 #ifdef HAVE_COMPLEX_H
32 #include <complex.h>
33 #endif
34 
35 #include "nfft3.h"
36 #include "infft.h"
37 
43 #undef FPT_CLENSHAW_USE_ONLY_LONG_DOUBLE
44 
45 /* Macros for index calculation. */
46 
48 #define K_START_TILDE(x,y) (MAX(MIN(x,y-2),0))
49 
51 #define K_END_TILDE(x,y) MIN(x,y-1)
52 
54 #define FIRST_L(x,y) (LRINT(floor((x)/(double)y)))
55 
57 #define LAST_L(x,y) (LRINT(ceil(((x)+1)/(double)y))-1)
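/* Example: with plength = 8, k_start = 3 and k_end_tilde = 7 one gets
 * K_START_TILDE(3,8) = 3, K_END_TILDE(7,8) = 7, FIRST_L(3,8) = 0 and
 * LAST_L(7,8) = 0, i.e. only the single block l = 0 of length 8 intersects
 * the coefficient range. */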
58 
59 #define N_TILDE(y) (y-1)
60 
61 #define IS_SYMMETRIC(x,y,z) (x >= ((y-1.0)/z))
62 
63 #define FPT_BREAK_EVEN 4
64 
65 #include "fpt.h"
66 
67 
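/* Computes u[l] = a * (b * v[l] * x[l] + w[l] * y[l]) for l = 0,...,n-1. */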
68 static inline void abuvxpwy(double a, double b, double _Complex* u, double _Complex* x,
69  double* v, double _Complex* y, double* w, int n)
70 {
71  int l; double _Complex *u_ptr = u, *x_ptr = x, *y_ptr = y;
72  double *v_ptr = v, *w_ptr = w;
73  for (l = 0; l < n; l++)
74  *u_ptr++ = a * (b * (*v_ptr++) * (*x_ptr++) + (*w_ptr++) * (*y_ptr++));
75 }
76 
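/* Variants of abuvxpwy for the case that the real factors v and w are stored
 * only for the first n/2 entries; the second half is traversed in reverse
 * order and weighted with the signs S1 and S2, respectively. */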
77 #define ABUVXPWY_SYMMETRIC(NAME,S1,S2) \
78 static inline void NAME(double a, double b, double _Complex* u, double _Complex* x, \
79  double* v, double _Complex* y, double* w, int n) \
80 { \
81  const int n2 = n>>1; \
82  int l; double _Complex *u_ptr = u, *x_ptr = x, *y_ptr = y; \
83  double *v_ptr = v, *w_ptr = w; \
84  for (l = 0; l < n2; l++) \
85  *u_ptr++ = a * (b * (*v_ptr++) * (*x_ptr++) + (*w_ptr++) * (*y_ptr++)); \
86  v_ptr--; w_ptr--; \
87  for (l = 0; l < n2; l++) \
88  *u_ptr++ = a * (b * S1 * (*v_ptr--) * (*x_ptr++) + S2 * (*w_ptr--) * (*y_ptr++)); \
89 }
90 
91 ABUVXPWY_SYMMETRIC(abuvxpwy_symmetric1,1.0,-1.0)
92 ABUVXPWY_SYMMETRIC(abuvxpwy_symmetric2,-1.0,1.0)
93 
94 #define ABUVXPWY_SYMMETRIC_1(NAME,S1) \
95 static inline void NAME(double a, double b, double _Complex* u, double _Complex* x, \
96  double* v, double _Complex* y, int n, double *xx) \
97 { \
98  const int n2 = n>>1; \
99  int l; double _Complex *u_ptr = u, *x_ptr = x, *y_ptr = y; \
100  double *v_ptr = v, *xx_ptr = xx; \
101  for (l = 0; l < n2; l++, v_ptr++) \
102  *u_ptr++ = a * (b * (*v_ptr) * (*x_ptr++) + ((*v_ptr)*(1.0+*xx_ptr++)) * (*y_ptr++)); \
103  v_ptr--; \
104  for (l = 0; l < n2; l++, v_ptr--) \
105  *u_ptr++ = a * (b * S1 * (*v_ptr) * (*x_ptr++) + (S1 * (*v_ptr) * (1.0+*xx_ptr++)) * (*y_ptr++)); \
106 }
107 
108 ABUVXPWY_SYMMETRIC_1(abuvxpwy_symmetric1_1,1.0)
109 ABUVXPWY_SYMMETRIC_1(abuvxpwy_symmetric1_2,-1.0)
110 
111 #define ABUVXPWY_SYMMETRIC_2(NAME,S1) \
112 static inline void NAME(double a, double b, double _Complex* u, double _Complex* x, \
113  double _Complex* y, double* w, int n, double *xx) \
114 { \
115  const int n2 = n>>1; \
116  int l; double _Complex *u_ptr = u, *x_ptr = x, *y_ptr = y; \
117  double *w_ptr = w, *xx_ptr = xx; \
118  for (l = 0; l < n2; l++, w_ptr++) \
119  *u_ptr++ = a * (b * (*w_ptr/(1.0+*xx_ptr++)) * (*x_ptr++) + (*w_ptr) * (*y_ptr++)); \
120  w_ptr--; \
121  for (l = 0; l < n2; l++, w_ptr--) \
122  *u_ptr++ = a * (b * (S1 * (*w_ptr)/(1.0+*xx_ptr++) ) * (*x_ptr++) + S1 * (*w_ptr) * (*y_ptr++)); \
123 }
124 
125 ABUVXPWY_SYMMETRIC_2(abuvxpwy_symmetric2_1,1.0)
126 ABUVXPWY_SYMMETRIC_2(abuvxpwy_symmetric2_2,-1.0)
127 
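/* Computes u[l] = a * (v[l] * x[l] + w[l] * y[l]) for l = 0,...,n-1. */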
128 static inline void auvxpwy(double a, double _Complex* u, double _Complex* x, double* v,
129  double _Complex* y, double* w, int n)
130 {
131  int l;
132  double _Complex *u_ptr = u, *x_ptr = x, *y_ptr = y;
133  double *v_ptr = v, *w_ptr = w;
134  for (l = n; l > 0; l--)
135  *u_ptr++ = a * ((*v_ptr++) * (*x_ptr++) + (*w_ptr++) * (*y_ptr++));
136 }
137 
138 static inline void auvxpwy_symmetric(double a, double _Complex* u, double _Complex* x,
139  double* v, double _Complex* y, double* w, int n)
140 {
141  const int n2 = n>>1;
142  int l;
143  double _Complex *u_ptr = u, *x_ptr = x, *y_ptr = y;
144  double *v_ptr = v, *w_ptr = w;
145  for (l = n2; l > 0; l--)
146  *u_ptr++ = a * ((*v_ptr++) * (*x_ptr++) + (*w_ptr++) * (*y_ptr++));
147  v_ptr--; w_ptr--;
148  for (l = n2; l > 0; l--)
149  *u_ptr++ = a * ((*v_ptr--) * (*x_ptr++) - (*w_ptr--) * (*y_ptr++));
150 }
151 
152 static inline void auvxpwy_symmetric_1(double a, double _Complex* u, double _Complex* x,
153  double* v, double _Complex* y, double* w, int n, double *xx)
154 {
155  const int n2 = n>>1;
156  int l;
157  double _Complex *u_ptr = u, *x_ptr = x, *y_ptr = y;
158  double *v_ptr = v, *w_ptr = w, *xx_ptr = xx;
159  for (l = n2; l > 0; l--, xx_ptr++)
160  *u_ptr++ = a * (((*v_ptr++)*(1.0+*xx_ptr)) * (*x_ptr++) + ((*w_ptr++)*(1.0+*xx_ptr)) * (*y_ptr++));
161  v_ptr--; w_ptr--;
162  for (l = n2; l > 0; l--, xx_ptr++)
163  *u_ptr++ = a * (-((*v_ptr--)*(1.0+*xx_ptr)) * (*x_ptr++) + ((*w_ptr--)*(1.0+*xx_ptr)) * (*y_ptr++));
164 }
165 
166 static inline void auvxpwy_symmetric_2(double a, double _Complex* u, double _Complex* x,
167  double* v, double _Complex* y, double* w, int n, double *xx)
168 {
169  const int n2 = n>>1;
170  int l;
171  double _Complex *u_ptr = u, *x_ptr = x, *y_ptr = y;
172  double *v_ptr = v, *w_ptr = w, *xx_ptr = xx;
173  for (l = n2; l > 0; l--, xx_ptr++)
174  *u_ptr++ = a * (((*v_ptr++)/(1.0+*xx_ptr)) * (*x_ptr++) + ((*w_ptr++)/(1.0+*xx_ptr)) * (*y_ptr++));
175  v_ptr--; w_ptr--;
176  for (l = n2; l > 0; l--, xx_ptr++)
177  *u_ptr++ = a * (-((*v_ptr--)/(1.0+*xx_ptr)) * (*x_ptr++) + ((*w_ptr--)/(1.0+*xx_ptr)) * (*y_ptr++));
178 }
179 
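/* One cascade step: transform a and b from Chebyshev coefficients to function
 * values (DCT-III), apply the 2x2 matrix (a11 a12; a21 a22) pointwise, with
 * the first component additionally scaled by g, and transform back to
 * Chebyshev coefficients (DCT-II). For g == 0 the first component is not
 * needed, so only the second row is applied. */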
180 #define FPT_DO_STEP(NAME,M1_FUNCTION,M2_FUNCTION) \
181 static inline void NAME(double _Complex *a, double _Complex *b, double *a11, double *a12, \
182  double *a21, double *a22, double g, int tau, fpt_set set) \
183 { \
184  \
185  int length = 1<<(tau+1); \
186  \
187  double norm = 1.0/(length<<1); \
188  \
189  /* Compensate for factors introduced by a raw DCT-III. */ \
190  a[0] *= 2.0; \
191  b[0] *= 2.0; \
192  \
193  /* Compute function values from Chebyshev-coefficients using a DCT-III. */ \
194  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)a,(double*)a); \
195  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)b,(double*)b); \
196  \
197  /*for (k = 0; k < length; k++)*/ \
198  /*{*/ \
199  /*fprintf(stderr,"fpt_do_step: a11 = %le, a12 = %le, a21 = %le, a22 = %le\n",*/ \
200  /* a11[k],a12[k],a21[k],a22[k]);*/ \
201  /*}*/ \
202  \
203  /* Check if gamma is zero. */ \
204  if (g == 0.0) \
205  { \
206  /*fprintf(stderr,"gamma == 0!\n");*/ \
207  /* Perform multiplication only for second row. */ \
208  M2_FUNCTION(norm,b,b,a22,a,a21,length); \
209  } \
210  else \
211  { \
212  /*fprintf(stderr,"gamma != 0!\n");*/ \
213  /* Perform multiplication for both rows. */ \
214  M2_FUNCTION(norm,set->z,b,a22,a,a21,length); \
215  M1_FUNCTION(norm*g,a,a,a11,b,a12,length); \
216  memcpy(b,set->z,length*sizeof(double _Complex)); \
217  /* Compute Chebyshev-coefficients using a DCT-II. */ \
218  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)a,(double*)a); \
219  /* Compensate for factors introduced by a raw DCT-II. */ \
220  a[0] *= 0.5; \
221  } \
222  \
223  /* Compute Chebyshev-coefficients using a DCT-II. */ \
224  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)b,(double*)b); \
225  /* Compensate for factors introduced by a raw DCT-II. */ \
226  b[0] *= 0.5; \
227 }
228 
229 FPT_DO_STEP(fpt_do_step,auvxpwy,auvxpwy)
230 FPT_DO_STEP(fpt_do_step_symmetric,auvxpwy_symmetric,auvxpwy_symmetric)
231 /*FPT_DO_STEP(fpt_do_step_symmetric_u,auvxpwy_symmetric,auvxpwy)
232 FPT_DO_STEP(fpt_do_step_symmetric_l,auvxpwy,auvxpwy_symmetric)*/
233 
234 static inline void fpt_do_step_symmetric_u(double _Complex *a, double _Complex *b,
235  double *a11, double *a12, double *a21, double *a22, double *x,
236  double gam, int tau, fpt_set set)
237 {
239  int length = 1<<(tau+1);
241  double norm = 1.0/(length<<1);
242 
243  UNUSED(a21); UNUSED(a22);
244 
245  /* Compensate for factors introduced by a raw DCT-III. */
246  a[0] *= 2.0;
247  b[0] *= 2.0;
248 
249  /* Compute function values from Chebyshev-coefficients using a DCT-III. */
250  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)a,(double*)a);
251  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)b,(double*)b);
252 
253  /*for (k = 0; k < length; k++)*/
254  /*{*/
255  /*fprintf(stderr,"fpt_do_step: a11 = %le, a12 = %le, a21 = %le, a22 = %le\n",*/
256  /* a11[k],a12[k],a21[k],a22[k]);*/
257  /*}*/
258 
259  /* Check if gamma is zero. */
260  if (gam == 0.0)
261  {
262  /*fprintf(stderr,"gamma == 0!\n");*/
263  /* Perform multiplication only for second row. */
264  auvxpwy_symmetric_1(norm,b,b,a12,a,a11,length,x);
265  }
266  else
267  {
268  /*fprintf(stderr,"gamma != 0!\n");*/
269  /* Perform multiplication for both rows. */
270  auvxpwy_symmetric_1(norm,set->z,b,a12,a,a11,length,x);
271  auvxpwy_symmetric(norm*gam,a,a,a11,b,a12,length);
272  memcpy(b,set->z,length*sizeof(double _Complex));
273  /* Compute Chebyshev-coefficients using a DCT-II. */
274  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)a,(double*)a);
275  /* Compensate for factors introduced by a raw DCT-II. */
276  a[0] *= 0.5;
277  }
278 
279  /* Compute Chebyshev-coefficients using a DCT-II. */
280  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)b,(double*)b);
281  /* Compensate for factors introduced by a raw DCT-II. */
282  b[0] *= 0.5;
283 }
284 
285 static inline void fpt_do_step_symmetric_l(double _Complex *a, double _Complex *b,
286  double *a11, double *a12, double *a21, double *a22, double *x, double gam, int tau, fpt_set set)
287 {
289  int length = 1<<(tau+1);
291  double norm = 1.0/(length<<1);
292 
293  /* Compensate for factors introduced by a raw DCT-III. */
294  a[0] *= 2.0;
295  b[0] *= 2.0;
296 
297  UNUSED(a11); UNUSED(a12);
298 
299  /* Compute function values from Chebyshev-coefficients using a DCT-III. */
300  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)a,(double*)a);
301  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)b,(double*)b);
302 
303  /*for (k = 0; k < length; k++)*/
304  /*{*/
305  /*fprintf(stderr,"fpt_do_step: a11 = %le, a12 = %le, a21 = %le, a22 = %le\n",*/
306  /* a11[k],a12[k],a21[k],a22[k]);*/
307  /*}*/
308 
309  /* Check if gamma is zero. */
310  if (gam == 0.0)
311  {
312  /*fprintf(stderr,"gamma == 0!\n");*/
313  /* Perform multiplication only for second row. */
314  auvxpwy_symmetric(norm,b,b,a22,a,a21,length);
315  }
316  else
317  {
318  /*fprintf(stderr,"gamma != 0!\n");*/
319  /* Perform multiplication for both rows. */
320  auvxpwy_symmetric(norm,set->z,b,a22,a,a21,length);
321  auvxpwy_symmetric_2(norm*gam,a,a,a21,b,a22,length,x);
322  memcpy(b,set->z,length*sizeof(double _Complex));
323  /* Compute Chebyshev-coefficients using a DCT-II. */
324  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)a,(double*)a);
325  /* Compensate for factors introduced by a raw DCT-II. */
326  a[0] *= 0.5;
327  }
328 
329  /* Compute Chebyshev-coefficients using a DCT-II. */
330  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)b,(double*)b);
331  /* Compensate for factors introduced by a raw DCT-II. */
332  b[0] *= 0.5;
333 }
334 
335 #define FPT_DO_STEP_TRANSPOSED(NAME,M1_FUNCTION,M2_FUNCTION) \
336 static inline void NAME(double _Complex *a, double _Complex *b, double *a11, \
337  double *a12, double *a21, double *a22, double g, int tau, fpt_set set) \
338 { \
339  \
340  int length = 1<<(tau+1); \
341  \
342  double norm = 1.0/(length<<1); \
343  \
344  /* Compute function values from Chebyshev-coefficients using a DCT-III. */ \
345  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)a,(double*)a); \
346  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)b,(double*)b); \
347  \
348  /* Perform matrix multiplication. */ \
349  M1_FUNCTION(norm,g,set->z,a,a11,b,a21,length); \
350  M2_FUNCTION(norm,g,b,a,a12,b,a22,length); \
351  memcpy(a,set->z,length*sizeof(double _Complex)); \
352  \
353  /* Compute Chebyshev-coefficients using a DCT-II. */ \
354  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)a,(double*)a); \
355  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)b,(double*)b); \
356 }
357 
358 FPT_DO_STEP_TRANSPOSED(fpt_do_step_t,abuvxpwy,abuvxpwy)
359 FPT_DO_STEP_TRANSPOSED(fpt_do_step_t_symmetric,abuvxpwy_symmetric1,abuvxpwy_symmetric2)
360 /*FPT_DO_STEP_TRANSPOSED(fpt_do_step_t_symmetric_u,abuvxpwy_symmetric1_1,abuvxpwy_symmetric1_2)*/
361 /*FPT_DO_STEP_TRANSPOSED(fpt_do_step_t_symmetric_l,abuvxpwy_symmetric2_2,abuvxpwy_symmetric2_1)*/
362 
363 
364 static inline void fpt_do_step_t_symmetric_u(double _Complex *a,
365  double _Complex *b, double *a11, double *a12, double *x,
366  double gam, int tau, fpt_set set)
367 {
369  int length = 1<<(tau+1);
371  double norm = 1.0/(length<<1);
372 
373  /* Compute function values from Chebyshev-coefficients using a DCT-III. */
374  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)a,(double*)a);
375  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)b,(double*)b);
376 
377  /* Perform matrix multiplication. */
378  abuvxpwy_symmetric1_1(norm,gam,set->z,a,a11,b,length,x);
379  abuvxpwy_symmetric1_2(norm,gam,b,a,a12,b,length,x);
380  memcpy(a,set->z,length*sizeof(double _Complex));
381 
382  /* Compute Chebyshev-coefficients using a DCT-II. */
383  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)a,(double*)a);
384  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)b,(double*)b);
385 }
386 
387 static inline void fpt_do_step_t_symmetric_l(double _Complex *a,
388  double _Complex *b, double *a21, double *a22,
389  double *x, double gam, int tau, fpt_set set)
390 {
392  int length = 1<<(tau+1);
394  double norm = 1.0/(length<<1);
395 
396  /* Compute function values from Chebyshev-coefficients using a DCT-III. */
397  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)a,(double*)a);
398  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)b,(double*)b);
399 
400  /* Perform matrix multiplication. */
401  abuvxpwy_symmetric2_2(norm,gam,set->z,a,b,a21,length,x);
402  abuvxpwy_symmetric2_1(norm,gam,b,a,b,a22,length,x);
403  memcpy(a,set->z,length*sizeof(double _Complex));
404 
405  /* Compute Chebyshev-coefficients using a DCT-II. */
406  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)a,(double*)a);
407  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)b,(double*)b);
408 }
409 
410 
411 static void eval_clenshaw(const double *x, double *y, int size, int k, const double *alpha,
412  const double *beta, const double *gam)
413 {
414  /* Evaluate the associated Legendre polynomial P_{k,nleg} (l,x) for the vector
415  * of knots x[0], ..., x[size-1] by the Clenshaw algorithm
416  */
417  int i,j;
418  const double *x_act;
419  double *y_act;
420  const double *alpha_act, *beta_act, *gamma_act;
421 
422  /* Traverse all nodes. */
423  x_act = x;
424  y_act = y;
425  for (i = 0; i < size; i++)
426  {
427  double x_val_act = *x_act;
428 
429  if (k == 0)
430  {
431  *y_act = 1.0;
432  }
433  else
434  {
435  alpha_act = &(alpha[k]);
436  beta_act = &(beta[k]);
437  gamma_act = &(gam[k]);
438 
439 #ifdef FPT_CLENSHAW_USE_ONLY_LONG_DOUBLE
440  long double a = 1.0;
441  long double b = 0.0;
442  for (j = k; j > 1; j--)
443  {
444  long double a_old = a;
445  a = b + a_old*((*alpha_act)*x_val_act+(*beta_act));
446  b = a_old*(*gamma_act);
447  alpha_act--;
448  beta_act--;
449  gamma_act--;
450  }
451  *y_act = (a*((*alpha_act)*x_val_act+(*beta_act))+b);
452 #else
453  double a = 1.0;
454  double b = 0.0;
455  /* 1e247 should not trigger for NFSFT with N <= 1024 */
456  for (j = k; j > 1 && fabs(a) < 1e247; j--)
457  {
458  double a_old = a;
459  a = b + a_old*((*alpha_act)*x_val_act+(*beta_act));
460  b = a_old*(*gamma_act);
461  alpha_act--;
462  beta_act--;
463  gamma_act--;
464  }
465  if (j <= 1)
466  *y_act = (a*((*alpha_act)*x_val_act+(*beta_act))+b);
467  else /* fabs(a) >= 1e247, continue in long double */
468  {
469  long double a_ld = a;
470  long double b_ld = b;
471  for (; j > 1; j--)
472  {
473  long double a_old = a_ld;
474  a_ld = b_ld + a_old*((*alpha_act)*x_val_act+(*beta_act));
475  b_ld = a_old*(*gamma_act);
476  alpha_act--;
477  beta_act--;
478  gamma_act--;
479  }
480  *y_act = (a_ld*((*alpha_act)*x_val_act+(*beta_act))+b_ld);
481  }
482 #endif
483 
484  }
485  x_act++;
486  y_act++;
487  }
488 }
489 
490 static void eval_clenshaw2(const double *x, double *z, double *y, int size1, int size, int k, const double *alpha,
491  const double *beta, const double *gam)
492 {
493  /* Evaluate the associated Legendre polynomial P_{k,nleg} (l,x) for the vector
494  * of knots x[0], ..., x[size-1] by the Clenshaw algorithm
495  */
496  int i,j;
497  double a,b,x_val_act,a_old;
498  const double *x_act;
499  double *y_act, *z_act;
500  const double *alpha_act, *beta_act, *gamma_act;
501 
502  /* Traverse all nodes. */
503  x_act = x;
504  y_act = y;
505  z_act = z;
506  for (i = 0; i < size; i++)
507  {
508  a = 1.0;
509  b = 0.0;
510  x_val_act = *x_act;
511 
512  if (k == 0)
513  {
514  *y_act = 1.0;
515  *z_act = 0.0;
516  }
517  else
518  {
519  alpha_act = &(alpha[k]);
520  beta_act = &(beta[k]);
521  gamma_act = &(gam[k]);
522  for (j = k; j > 1; j--)
523  {
524  a_old = a;
525  a = b + a_old*((*alpha_act)*x_val_act+(*beta_act));
526  b = a_old*(*gamma_act);
527  alpha_act--;
528  beta_act--;
529  gamma_act--;
530  }
531  if (i < size1)
532  *z_act = a;
533  *y_act = (a*((*alpha_act)*x_val_act+(*beta_act))+b);
534  }
535 
536  x_act++;
537  y_act++;
538  z_act++;
539  }
540  /*gamma_act++;
541  FILE *f = fopen("/Users/keiner/Desktop/nfsft_debug.txt","a");
542  fprintf(f,"size1: %10d, size: %10d\n",size1,size);
543  fclose(f);*/
544 }
545 
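/* Same evaluation as eval_clenshaw2, but aborts and returns 1 as soon as one
 * of the computed values exceeds the threshold in absolute value (the caller
 * then switches to a stabilization step); returns 0 otherwise. */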
546 static int eval_clenshaw_thresh2(const double *x, double *z, double *y, int size, int k,
547  const double *alpha, const double *beta, const double *gam, const
548  double threshold)
549 {
550  /* Evaluate the associated Legendre polynomial P_{k,nleg} (l,x) for the vector
551  * of knots x[0], ..., x[size-1] by the Clenshaw algorithm
552  */
553  int i,j;
554  double x_val_act;
555  const double *x_act;
556  double *y_act, *z_act;
557  const double *alpha_act, *beta_act, *gamma_act;
558  const R threshold_abs = FABS(threshold);
559 
560  /* Traverse all nodes. */
561  x_act = x;
562  y_act = y;
563  z_act = z;
564  for (i = 0; i < size; i++)
565  {
566  x_val_act = *x_act;
567 
568  if (k == 0)
569  {
570  *y_act = 1.0;
571  *z_act = 0.0;
572  }
573  else
574  {
575  alpha_act = &(alpha[k]);
576  beta_act = &(beta[k]);
577  gamma_act = &(gam[k]);
578 
579 #ifdef FPT_CLENSHAW_USE_ONLY_LONG_DOUBLE
580  long double a = 1.0;
581  long double b = 0.0;
582  for (j = k; j > 1; j--)
583  {
584  long double a_old = a;
585  a = b + a_old*((*alpha_act)*x_val_act+(*beta_act));
586  b = a_old*(*gamma_act);
587  alpha_act--;
588  beta_act--;
589  gamma_act--;
590  }
591  *z_act = a;
592  *y_act = (a*((*alpha_act)*x_val_act+(*beta_act))+b);
593 #else
594  double a = 1.0;
595  double b = 0.0;
596  for (j = k; j > 1 && fabs(a) < 1e247; j--)
597  {
598  double a_old = a;
599  a = b + a_old*((*alpha_act)*x_val_act+(*beta_act));
600  b = a_old*(*gamma_act);
601  alpha_act--;
602  beta_act--;
603  gamma_act--;
604  }
605  if (j <= 1)
606  {
607  *z_act = a;
608  *y_act = (a*((*alpha_act)*x_val_act+(*beta_act))+b);
609  }
610  else /* fabs(a) >= 1e247, continue in long double */
611  {
612  long double a_ld = a;
613  long double b_ld = b;
614  for (; j > 1; j--)
615  {
616  long double a_old = a_ld;
617  a_ld = b_ld + a_old*((*alpha_act)*x_val_act+(*beta_act));
618  b_ld = a_old*(*gamma_act);
619  alpha_act--;
620  beta_act--;
621  gamma_act--;
622  }
623  *z_act = a_ld;
624  *y_act = (a_ld*((*alpha_act)*x_val_act+(*beta_act))+b_ld);
625  }
626 #endif
627  if (FABS(*y_act) > threshold_abs)
628  return 1;
629  }
630  x_act++;
631  y_act++;
632  z_act++;
633  }
634  return 0;
635 }
636 
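/* Computes y[j] = lambda * sum_{k=0}^{N} a[k] P_k(x[j]) for j = 0,...,M by the
 * Clenshaw algorithm, where the P_k satisfy the three-term recurrence with
 * coefficients alpha, beta and gam. */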
637 static inline void eval_sum_clenshaw_fast(const int N, const int M,
638  const double _Complex *a, const double *x, double _Complex *y,
639  const double *alpha, const double *beta, const double *gam,
640  const double lambda)
641 {
642  int j,k;
643 
644  if (N == 0)
645  for (j = 0; j <= M; j++)
646  y[j] = a[0];
647  else
648  {
649  for (j = 0; j <= M; j++)
650  {
651  double xc = x[j];
652 #ifdef FPT_CLENSHAW_USE_ONLY_LONG_DOUBLE
653  long double _Complex tmp1 = a[N-1];
654  long double _Complex tmp2 = a[N];
655  for (k = N-1; k > 0; k--)
656  {
657  long double _Complex tmp3 = a[k-1] + tmp2 * gam[k];
658  tmp2 *= (alpha[k] * xc + beta[k]);
659  tmp2 += tmp1;
660  tmp1 = tmp3;
661  }
662  tmp2 *= (alpha[0] * xc + beta[0]);
663  y[j] = lambda * (tmp2 + tmp1);
664 #else
665  double _Complex tmp1 = a[N-1];
666  double _Complex tmp2 = a[N];
667  /* 1e247 should not trigger for NFSFT with N <= 1024 */
668  for (k = N-1; k > 0 && fabs(creal(tmp2)) < 1e247 && fabs(cimag(tmp2)) < 1e247; k--)
669  {
670  double _Complex tmp3 = a[k-1] + tmp2 * gam[k];
671  tmp2 *= (alpha[k] * xc + beta[k]);
672  tmp2 += tmp1;
673  tmp1 = tmp3;
674  }
675  if (k <= 0)
676  {
677  tmp2 *= (alpha[0] * xc + beta[0]);
678  y[j] = lambda * (tmp2 + tmp1);
679  }
680  else /* fabs(tmp2) >= 1e247 */
681  {
682  long double _Complex tmp1_ld = tmp1;
683  long double _Complex tmp2_ld = tmp2;
684  for (; k > 0; k--)
685  {
686  long double _Complex tmp3_ld = a[k-1] + tmp2_ld * gam[k];
687  tmp2_ld *= (alpha[k] * xc + beta[k]);
688  tmp2_ld += tmp1_ld;
689  tmp1_ld = tmp3_ld;
690  }
691  tmp2_ld *= (alpha[0] * xc + beta[0]);
692  y[j] = lambda * (tmp2_ld + tmp1_ld);
693  } /* end fabs(tmp2) >= 1e247 */
694 #endif
695  } /* for j */
696  } /* N > 0 */
697 }
698 
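/* Transposed summation: accumulates a[k] = lambda * sum_{j=0}^{M} y[j] P_k(x[j])
 * for k = 0,...,N, where the P_k satisfy the same three-term recurrence. */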
717 static void eval_sum_clenshaw_transposed(int N, int M, double _Complex* a, double *x,
718  double _Complex *y, double _Complex *temp, double *alpha, double *beta, double *gam,
719  double lambda)
720 {
721  int j,k;
722  double _Complex* it1 = temp;
723  double _Complex* it2 = y;
724  double _Complex aux;
725 
726  /* Compute final result by multiplying with the constant lambda */
727  a[0] = 0.0;
728  for (j = 0; j <= M; j++)
729  {
730  it2[j] = lambda * y[j];
731  a[0] += it2[j];
732  }
733 
734  if (N > 0)
735  {
736  /* Compute final step. */
737  a[1] = 0.0;
738  for (j = 0; j <= M; j++)
739  {
740  it1[j] = it2[j];
741  it2[j] = it2[j] * (alpha[0] * x[j] + beta[0]);
742  a[1] += it2[j];
743  }
744 
745  for (k = 2; k <= N; k++)
746  {
747  a[k] = 0.0;
748  for (j = 0; j <= M; j++)
749  {
750  aux = it1[j];
751  it1[j] = it2[j];
752  it2[j] = it2[j]*(alpha[k-1] * x[j] + beta[k-1]) + gam[k-1] * aux;
753  a[k] += it2[j];
754  }
755  }
756  }
757 }
758 
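/* Same computation as eval_sum_clenshaw_transposed, carried out in long double
 * to reduce accumulation errors; used by fpt_transposed_direct for N > 1024. */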
759 static void eval_sum_clenshaw_transposed_ld(int N, int M, double _Complex* a, double *x,
760  double _Complex *y, double _Complex *temp, double *alpha, double *beta, double *gam,
761  double lambda)
762 {
763  int j,k;
764 
765  for (k = 0; k <= N; k++)
766  a[k] = 0.0;
767 
768  if (N == 0)
769  for (j = 0; j <= M; j++)
770  a[0] += lambda * y[j];
771  else
772  {
773  for (j = 0; j <= M; j++)
774  {
775  /* Compute final result by multiplying with the constant lambda */
776  long double _Complex it2 = lambda * y[j];
777  a[0] += it2;
778 
779  /* Compute final step. */
780  long double _Complex it1 = it2;
781  it2 = it2 * (alpha[0] * x[j] + beta[0]);
782  a[1] += it2;
783 
784  for (k = 2; k <= N; k++)
785  {
786  long double _Complex aux = it1;
787  it1 = it2;
788  it2 = it2*(alpha[k-1] * x[j] + beta[k-1]) + gam[k-1] * aux;
789  a[k] += it2;
790  }
791  }
792  }
793 }
794 
795 fpt_set fpt_init(const int M, const int t, const unsigned int flags)
796 {
798  int plength;
800  int tau;
802  int m;
803  int k;
804 #ifdef _OPENMP
805  int nthreads = X(get_num_threads)();
806 #endif
807 
808  /* Allocate memory for new DPT set. */
809  fpt_set_s *set = (fpt_set_s*)nfft_malloc(sizeof(fpt_set_s));
810 
811  /* Save parameters in structure. */
812  set->flags = flags;
813 
814  set->M = M;
815  set->t = t;
816  set->N = 1<<t;
817 
818  if (!(flags & FPT_NO_INIT_FPT_DATA))
819  {
820  /* Allocate memory for M transforms. */
821  set->dpt = (fpt_data*) nfft_malloc(M*sizeof(fpt_data));
822 
823  /* Initialize with NULL pointer. */
824  for (m = 0; m < set->M; m++)
825  {
826  set->dpt[m].steps = NULL;
827  set->dpt[m].precomputed = false;
828  }
829  }
830  else
831  set->dpt = NULL;
832 
833  /* Create arrays with Chebyshev nodes. */
834 
835  /* Initialize the array with Chebyshev coefficients for the polynomial x. This
836  * is trivially the array with a 1 as its second entry and all other
837  * coefficients set to zero. To compensate for the multiplicative
838  * factor 2 introduced by the DCT-III, this coefficient is set to 0.5 here. */
839 
840  /* Allocate memory for array of pointers to node arrays. */
841  set->xcvecs = (double**) nfft_malloc((set->t)*sizeof(double*));
842  /* For each polynomial length, starting with 4, compute the Chebyshev nodes
843  * at which the DCT-III evaluates. */
844  plength = 4;
845  for (tau = 1; tau < t+1; tau++)
846  {
847  /* Allocate memory for current array. */
848  set->xcvecs[tau-1] = (double*) nfft_malloc(plength*sizeof(double));
849  for (k = 0; k < plength; k++)
850  {
851  set->xcvecs[tau-1][k] = cos(((k+0.5)*KPI)/plength);
852  }
853  plength = plength << 1;
854  }
855 
857  set->work = (double _Complex*) nfft_malloc((2*set->N)*sizeof(double _Complex));
858  set->result = (double _Complex*) nfft_malloc((2*set->N)*sizeof(double _Complex));
859 
860  set->plans_dct2 = (fftw_plan*) nfft_malloc(sizeof(fftw_plan)*(set->t/*-1*/));
861  set->kindsr = (fftw_r2r_kind*) nfft_malloc(2*sizeof(fftw_r2r_kind));
862  set->kindsr[0] = FFTW_REDFT10;
863  set->kindsr[1] = FFTW_REDFT10;
864  for (tau = 0, plength = 4; tau < set->t/*-1*/; tau++, plength<<=1)
865  {
866 #ifdef _OPENMP
867 #pragma omp critical (nfft_omp_critical_fftw_plan)
868 {
869  fftw_plan_with_nthreads(nthreads);
870 #endif
871  set->plans_dct2[tau] =
872  fftw_plan_many_r2r(1, &plength, 2, (double*)set->work, NULL,
873  2, 1, (double*)set->result, NULL, 2, 1,set->kindsr,
874  0);
875 #ifdef _OPENMP
876 }
877 #endif
878  }
879 
881  set->plans_dct3 = (fftw_plan*) nfft_malloc(sizeof(fftw_plan)*(set->t/*-1*/));
882  set->kinds = (fftw_r2r_kind*) nfft_malloc(2*sizeof(fftw_r2r_kind));
883  set->kinds[0] = FFTW_REDFT01;
884  set->kinds[1] = FFTW_REDFT01;
885  for (tau = 0, plength = 4; tau < set->t/*-1*/; tau++, plength<<=1)
886  {
887 #ifdef _OPENMP
888 #pragma omp critical (nfft_omp_critical_fftw_plan)
889 {
890  fftw_plan_with_nthreads(nthreads);
891 #endif
892  set->plans_dct3[tau] =
893  fftw_plan_many_r2r(1, &plength, 2, (double*)set->work, NULL,
894  2, 1, (double*)set->result, NULL, 2, 1, set->kinds,
895  0);
896 #ifdef _OPENMP
897 }
898 #endif
899  }
900  nfft_free(set->kinds);
901  nfft_free(set->kindsr);
902  set->kinds = NULL;
903  set->kindsr = NULL;
904 
905  set->vec3 = NULL;
906  set->vec4 = NULL;
907  set->z = NULL;
908 
909  set->xc_slow = NULL;
910  set->temp = NULL;
911 
912  /* Check if fast transform is activated. */
913  if (!(set->flags & FPT_NO_FAST_ALGORITHM))
914  {
916  set->vec3 = (double _Complex*) nfft_malloc(set->N*sizeof(double _Complex));
917  set->vec4 = (double _Complex*) nfft_malloc(set->N*sizeof(double _Complex));
918  set->z = (double _Complex*) nfft_malloc(set->N*sizeof(double _Complex));
919  }
920 
921  if (!(set->flags & FPT_NO_DIRECT_ALGORITHM))
922  {
923  set->xc_slow = (double*) nfft_malloc((set->N+1)*sizeof(double));
924  set->temp = (double _Complex*) nfft_malloc((set->N+1)*sizeof(double _Complex));
925 
926  if (!(flags & FPT_NO_INIT_FPT_DATA))
927  {
928  for (m = 0; m < set->M; m++)
929  {
930  fpt_data *data = &(set->dpt[m]);
931  data->_alpha = NULL;
932  data->_beta = NULL;
933  data->_gamma = NULL;
934  }
935  }
936  }
937 
938  /* Return the newly created DPT set. */
939  return set;
940 }
941 
942 void fpt_precompute_1(fpt_set set, const int m, int k_start)
943 {
944  int tau;
945  int l;
946  int plength;
948  int degree;
950  int firstl;
951  int lastl;
952  int k_start_tilde;
953  int N_tilde;
954  int clength;
955  fpt_data *data;
956 
957  /* Get pointer to DPT data. */
958  data = &(set->dpt[m]);
959 
960  /* Check if already precomputed. */
961  if (data->steps != NULL)
962  return;
963 
964  /* Save k_start. */
965  data->k_start = k_start;
966 
967  data->alphaN = NULL;
968  data->betaN = NULL;
969  data->gammaN = NULL;
970 
971  if (!(set->flags & FPT_NO_FAST_ALGORITHM))
972  {
973  /* Save recursion coefficients. */
974  data->alphaN = (double*) nfft_malloc(3*(set->t-1)*sizeof(double));
975  data->betaN = data->alphaN + (set->t-1);
976  data->gammaN = data->betaN + (set->t-1);
977 
978  k_start_tilde = K_START_TILDE(data->k_start,X(next_power_of_2)(data->k_start)
979  /*set->N*/);
980  N_tilde = N_TILDE(set->N);
981 
982  /* Allocate memory for the cascade with t = log_2(N) many levels. */
983  data->steps = (fpt_step**) nfft_malloc(sizeof(fpt_step*)*set->t);
984 
985  plength = 4;
986  for (tau = 1; tau < set->t; tau++)
987  {
988  /* Compute auxiliary values. */
989  degree = plength>>1;
990  /* Compute first l. */
991  firstl = FIRST_L(k_start_tilde,plength);
992  /* Compute last l. */
993  lastl = LAST_L(N_tilde,plength);
994 
995  /* Allocate memory for the current level. This level contains 2^{t-tau-1}
996  * matrices. */
997  data->steps[tau] = (fpt_step*) nfft_malloc(sizeof(fpt_step)
998  * (lastl+1));
999 
1000  /* For l = 0,...2^{t-tau-1}-1 compute the matrices U_{n,tau,l}. */
1001  for (l = firstl; l <= lastl; l++)
1002  {
1003  if ((set->flags & FPT_AL_SYMMETRY) && IS_SYMMETRIC(l,m,plength))
1004  {
1005  clength = plength/2;
1006  }
1007  else
1008  {
1009  clength = plength;
1010  }
1011 
1012  /* Allocate memory for the components of U_{n,tau,l}. */
1013  data->steps[tau][l].a = (double*) nfft_malloc(sizeof(double)*clength*4);
1014  }
1016  plength = plength << 1;
1017  }
1018  }
1019 
1020  if (!(set->flags & FPT_NO_DIRECT_ALGORITHM) && !(set->flags & FPT_PERSISTENT_DATA) && (data->_alpha == NULL))
1021  {
1022  data->_alpha = (double*) nfft_malloc(3*(set->N+1)*sizeof(double));
1023  data->_beta = data->_alpha + (set->N+1);
1024  data->_gamma = data->_beta + (set->N+1);
1025  }
1026 }
1027 
1028 void fpt_precompute_2(fpt_set set, const int m, double *alpha, double *beta,
1029  double *gam, int k_start, const double threshold)
1030 {
1031 
1032  int tau;
1033  int l;
1034  int plength;
1036  int degree;
1038  int firstl;
1039  int lastl;
1040  int plength_stab;
1042  int degree_stab;
1044  /*double *a11;*/
1046  /*double *a12;*/
1048  /*double *a21;*/
1050  /*double *a22;*/
1052  const double *calpha;
1053  const double *cbeta;
1054  const double *cgamma;
1055  int needstab = 0;
1056  int k_start_tilde;
1057  int N_tilde;
1058  int clength;
1059  int clength_1;
1060  int clength_2;
1061  int t_stab, N_stab;
1062  fpt_data *data;
1063 
1064  /* Get pointer to DPT data. */
1065  data = &(set->dpt[m]);
1066 
1067  /* Check if already precomputed. */
1068  if ((data->steps != NULL) && (data->precomputed))
1069  return;
1070 
1071  /* Save k_start. */
1072  data->k_start = k_start;
1073 
1074  data->gamma_m1 = gam[0];
1075 /* moved to fpt_precompute_1
1076  data->alphaN = NULL;
1077  data->betaN = NULL;
1078  data->gammaN = NULL;*/
1079 
1080  /* Check if fast transform is activated. */
1081  if (!(set->flags & FPT_NO_FAST_ALGORITHM))
1082  {
1083  /* Save recursion coefficients. moved to fpt_precompute_1
1084  data->alphaN = (double*) nfft_malloc((set->t-1)*sizeof(double _Complex));
1085  data->betaN = (double*) nfft_malloc((set->t-1)*sizeof(double _Complex));
1086  data->gammaN = (double*) nfft_malloc((set->t-1)*sizeof(double _Complex)); */
1087 
1088  for (tau = 2; tau <= set->t; tau++)
1089  {
1090 
1091  data->alphaN[tau-2] = alpha[1<<tau];
1092  data->betaN[tau-2] = beta[1<<tau];
1093  data->gammaN[tau-2] = gam[1<<tau];
1094  }
1095 
1096  data->alpha_0 = alpha[1];
1097  data->beta_0 = beta[1];
1098 
1099  k_start_tilde = K_START_TILDE(data->k_start,X(next_power_of_2)(data->k_start)
1100  /*set->N*/);
1101  N_tilde = N_TILDE(set->N);
1102 
1103  /* Allocate memory for the cascade with t = log_2(N) many levels. moved to fpt_precompute_1
1104  data->steps = (fpt_step**) nfft_malloc(sizeof(fpt_step*)*set->t); */
1105 
1106  /* For tau = 1,...t compute the matrices U_{n,tau,l}. */
1107  plength = 4;
1108  for (tau = 1; tau < set->t; tau++)
1109  {
1110  /* Compute auxiliary values. */
1111  degree = plength>>1;
1112  /* Compute first l. */
1113  firstl = FIRST_L(k_start_tilde,plength);
1114  /* Compute last l. */
1115  lastl = LAST_L(N_tilde,plength);
1116 
1117  /* Allocate memory for current level. This level will contain 2^{t-tau-1}
1118  * many matrices. moved to fpt_precompute_1
1119  data->steps[tau] = (fpt_step*) nfft_malloc(sizeof(fpt_step)
1120  * (lastl+1)); */
1121 
1122  /* For l = 0,...2^{t-tau-1}-1 compute the matrices U_{n,tau,l}. */
1123  for (l = firstl; l <= lastl; l++)
1124  {
1125  if ((set->flags & FPT_AL_SYMMETRY) && IS_SYMMETRIC(l,m,plength))
1126  {
1127  //fprintf(stderr,"fpt_precompute(%d): symmetric step\n",m);
1128  //fflush(stderr);
1129  clength = plength/2;
1130  }
1131  else
1132  {
1133  clength = plength;
1134  }
1135 
1136  /* Allocate memory for the components of U_{n,tau,l}. moved to fpt_precompute_1
1137  data->steps[tau][l].a11 = (double*) nfft_malloc(sizeof(double)*clength);
1138  data->steps[tau][l].a12 = (double*) nfft_malloc(sizeof(double)*clength);
1139  data->steps[tau][l].a21 = (double*) nfft_malloc(sizeof(double)*clength);
1140  data->steps[tau][l].a22 = (double*) nfft_malloc(sizeof(double)*clength); */
1141 
1142  /* Evaluate the associated polynomials at the 2^{tau+1} Chebyshev
1143  * nodes. */
1144 
1145  /* Get the pointers to the three-term recurrence coefficients. */
1146  calpha = &(alpha[plength*l+1+1]);
1147  cbeta = &(beta[plength*l+1+1]);
1148  cgamma = &(gam[plength*l+1+1]);
1149 
1150  double *a11 = data->steps[tau][l].a;
1151  double *a12 = a11+clength;
1152  double *a21 = a12+clength;
1153  double *a22 = a21+clength;
1154 
1155  if (set->flags & FPT_NO_STABILIZATION)
1156  {
1157  /* Evaluate P_{2^{tau}-2}^n(\cdot,2^{tau+1}l+2). */
1158  calpha--;
1159  cbeta--;
1160  cgamma--;
1161  eval_clenshaw2(set->xcvecs[tau-1], a11, a21, clength, clength, degree-1, calpha, cbeta,
1162  cgamma);
1163  eval_clenshaw2(set->xcvecs[tau-1], a12, a22, clength, clength, degree, calpha, cbeta,
1164  cgamma);
1165  needstab = 0;
1166  }
1167  else
1168  {
1169  calpha--;
1170  cbeta--;
1171  cgamma--;
1172  /* Evaluate P_{2^{tau}-1}^n(\cdot,2^{tau+1}l+1). */
1173  needstab = eval_clenshaw_thresh2(set->xcvecs[tau-1], a11, a21, clength,
1174  degree-1, calpha, cbeta, cgamma, threshold);
1175  if (needstab == 0)
1176  {
1177  /* Evaluate P_{2^{tau}}^n(\cdot,2^{tau+1}l+1). */
1178  needstab = eval_clenshaw_thresh2(set->xcvecs[tau-1], a12, a22, clength,
1179  degree, calpha, cbeta, cgamma, threshold);
1180  }
1181  }
1182 
1183 
1184  /* Check if stabilization needed. */
1185  if (needstab == 0)
1186  {
1187  /* No stabilization needed. */
1188  data->steps[tau][l].g = gam[plength*l+1+1];
1189  data->steps[tau][l].stable = true;
1190  }
1191  else
1192  {
1193  /* Stabilize. */
1194  degree_stab = degree*(2*l+1);
1195  X(next_power_of_2_exp_int)((l+1)*(1<<(tau+1)),&N_stab,&t_stab);
1196 
1197  /* The old arrays are too small. */
1198  nfft_free(data->steps[tau][l].a);
1199  a11 = NULL;
1200  a12 = NULL;
1201  a21 = NULL;
1202  a22 = NULL;
1203 
1204  plength_stab = N_stab;
1205 
1206  if (set->flags & FPT_AL_SYMMETRY)
1207  {
1208  if (m <= 1)
1209  {
1210  /* This should never be executed */
1211  clength_1 = plength_stab;
1212  clength_2 = plength_stab;
1213  /* Allocate memory for arrays. */
1214  data->steps[tau][l].a = (double*) nfft_malloc(sizeof(double)*(clength_1*2+clength_2*2));
1215  a11 = data->steps[tau][l].a;
1216  a12 = a11+clength_1;
1217  a21 = a12+clength_1;
1218  a22 = a21+clength_2;
1219  /* Get the pointers to the three-term recurrence coefficients. */
1220  calpha = &(alpha[1]); cbeta = &(beta[1]); cgamma = &(gam[1]);
1221  eval_clenshaw2(set->xcvecs[t_stab-2], a11, a21, clength_1,
1222  clength_2, degree_stab-1, calpha, cbeta, cgamma);
1223  eval_clenshaw2(set->xcvecs[t_stab-2], a12, a22, clength_1,
1224  clength_2, degree_stab+0, calpha, cbeta, cgamma);
1225  }
1226  else
1227  {
1228  clength = plength_stab/2;
1229  data->steps[tau][l].a = (double*) nfft_malloc(sizeof(double)*clength*2);
1230  if (m%2 == 0)
1231  {
1232  a11 = data->steps[tau][l].a;
1233  a12 = a11+clength;
1234  calpha = &(alpha[2]); cbeta = &(beta[2]); cgamma = &(gam[2]);
1235  eval_clenshaw(set->xcvecs[t_stab-2], a11, clength,
1236  degree_stab-2, calpha, cbeta, cgamma);
1237  eval_clenshaw(set->xcvecs[t_stab-2], a12, clength,
1238  degree_stab-1, calpha, cbeta, cgamma);
1239  }
1240  else
1241  {
1242  a21 = data->steps[tau][l].a;
1243  a22 = a21+clength;
1244  calpha = &(alpha[1]); cbeta = &(beta[1]); cgamma = &(gam[1]);
1245  eval_clenshaw(set->xcvecs[t_stab-2], a21, clength,
1246  degree_stab-1,calpha, cbeta, cgamma);
1247  eval_clenshaw(set->xcvecs[t_stab-2], a22, clength,
1248  degree_stab+0, calpha, cbeta, cgamma);
1249  }
1250  }
1251  }
1252  else
1253  {
1254  clength_1 = plength_stab;
1255  clength_2 = plength_stab;
1256  data->steps[tau][l].a = (double*) nfft_malloc(sizeof(double)*(clength_1*2+clength_2*2));
1257  a11 = data->steps[tau][l].a;
1258  a12 = a11+clength_1;
1259  a21 = a12+clength_1;
1260  a22 = a21+clength_2;
1261  calpha = &(alpha[2]);
1262  cbeta = &(beta[2]);
1263  cgamma = &(gam[2]);
1264  calpha--;
1265  cbeta--;
1266  cgamma--;
1267  eval_clenshaw2(set->xcvecs[t_stab-2], a11, a21, clength_1, clength_2, degree_stab-1,
1268  calpha, cbeta, cgamma);
1269  eval_clenshaw2(set->xcvecs[t_stab-2], a12, a22, clength_1, clength_2, degree_stab+0,
1270  calpha, cbeta, cgamma);
1271 
1272  }
1273 
1274  data->steps[tau][l].g = gam[1+1];
1275  data->steps[tau][l].stable = false;
1276  data->steps[tau][l].ts = t_stab;
1277  data->steps[tau][l].Ns = N_stab;
1278  }
1279  }
1281  plength = plength << 1;
1282  }
1283  data->precomputed = true;
1284  }
1285 
1286  if (!(set->flags & FPT_NO_DIRECT_ALGORITHM))
1287  {
1288  /* Check if the recurrence coefficients must be copied. */
1289  if (set->flags & FPT_PERSISTENT_DATA)
1290  {
1291  data->_alpha = (double*) alpha;
1292  data->_beta = (double*) beta;
1293  data->_gamma = (double*) gam;
1294  }
1295  else
1296  {/* moved to fpt_precompute_1
1297  data->_alpha = (double*) nfft_malloc((set->N+1)*sizeof(double));
1298  data->_beta = (double*) nfft_malloc((set->N+1)*sizeof(double));
1299  data->_gamma = (double*) nfft_malloc((set->N+1)*sizeof(double));*/
1300  memcpy(data->_alpha,alpha,(set->N+1)*sizeof(double));
1301  memcpy(data->_beta,beta,(set->N+1)*sizeof(double));
1302  memcpy(data->_gamma,gam,(set->N+1)*sizeof(double));
1303  }
1304  }
1305 }
1306 
1307 void fpt_precompute(fpt_set set, const int m, double *alpha, double *beta,
1308  double *gam, int k_start, const double threshold)
1309 {
1310  fpt_precompute_1(set, m, k_start);
1311  fpt_precompute_2(set, m, alpha, beta, gam, k_start, threshold);
1312 }
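/* Typical call sequence (schematic sketch; the choice of t, the threshold and
 * the way the recurrence coefficients are obtained are illustrative assumptions,
 * not part of this file):
 *
 *   int t = 6, N = 1 << t, m = 0, k_start = 0;
 *   double *alpha = (double*) nfft_malloc((N + 1) * sizeof(double));
 *   double *beta  = (double*) nfft_malloc((N + 1) * sizeof(double));
 *   double *gam   = (double*) nfft_malloc((N + 1) * sizeof(double));
 *   double _Complex *x = (double _Complex*) nfft_malloc((N - k_start + 1) * sizeof(double _Complex));
 *   double _Complex *y = (double _Complex*) nfft_malloc((N + 1) * sizeof(double _Complex));
 *
 *   // Fill alpha[0..N], beta[0..N], gam[0..N] with the three-term recurrence
 *   // coefficients of the target polynomial family and x with the input
 *   // coefficients x_{k_start},...,x_N.
 *
 *   fpt_set set = fpt_init(1, t, 0U);
 *   fpt_precompute(set, m, alpha, beta, gam, k_start, 1000.0);
 *   fpt_trafo(set, m, x, y, N, 0U);   // y receives the Chebyshev coefficients y_0,...,y_N
 *   fpt_finalize(set);
 */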
1313 
1314 void fpt_trafo_direct(fpt_set set, const int m, const double _Complex *x, double _Complex *y,
1315  const int k_end, const unsigned int flags)
1316 {
1317  int j;
1318  fpt_data *data = &(set->dpt[m]);
1319  int Nk;
1320  int tk;
1321  double norm;
1322 
1323  //fprintf(stderr, "Executing dpt.\n");
1324 
1325  X(next_power_of_2_exp_int)(k_end+1,&Nk,&tk);
1326  norm = 2.0/(Nk<<1);
1327 
1328  //fprintf(stderr, "Norm = %e.\n", norm);
1329 
1330  if (set->flags & FPT_NO_DIRECT_ALGORITHM)
1331  {
1332  return;
1333  }
1334 
1335  if (flags & FPT_FUNCTION_VALUES)
1336  {
1337  /* Fill array with Chebyshev nodes. */
1338  for (j = 0; j <= k_end; j++)
1339  {
1340  set->xc_slow[j] = cos((KPI*(j+0.5))/(k_end+1));
1341  //fprintf(stderr, "x[%4d] = %e.\n", j, set->xc_slow[j]);
1342  }
1343 
1344  memset(set->result,0U,data->k_start*sizeof(double _Complex));
1345  memcpy(&set->result[data->k_start],x,(k_end-data->k_start+1)*sizeof(double _Complex));
1346 
1347  /*eval_sum_clenshaw(k_end, k_end, set->result, set->xc_slow,
1348  y, set->work, &data->alpha[1], &data->beta[1], &data->gamma[1],
1349  data->gamma_m1);*/
1350  eval_sum_clenshaw_fast(k_end, k_end, set->result, set->xc_slow,
1351  y, &data->_alpha[1], &data->_beta[1], &data->_gamma[1], data->gamma_m1);
1352  }
1353  else
1354  {
1355  memset(set->temp,0U,data->k_start*sizeof(double _Complex));
1356  memcpy(&set->temp[data->k_start],x,(k_end-data->k_start+1)*sizeof(double _Complex));
1357 
1358  eval_sum_clenshaw_fast(k_end, Nk-1, set->temp, set->xcvecs[tk-2],
1359  set->result, &data->_alpha[1], &data->_beta[1], &data->_gamma[1],
1360  data->gamma_m1);
1361 
1362  fftw_execute_r2r(set->plans_dct2[tk-2],(double*)set->result,
1363  (double*)set->result);
1364 
1365  set->result[0] *= 0.5;
1366  for (j = 0; j < Nk; j++)
1367  {
1368  set->result[j] *= norm;
1369  }
1370 
1371  memcpy(y,set->result,(k_end+1)*sizeof(double _Complex));
1372  }
1373 }
1374 
1375 void fpt_trafo(fpt_set set, const int m, const double _Complex *x, double _Complex *y,
1376  const int k_end, const unsigned int flags)
1377 {
1378  /* Get transformation data. */
1379  fpt_data *data = &(set->dpt[m]);
1381  int Nk;
1383  int tk;
1385  int k_start_tilde;
1387  int k_end_tilde;
1388 
1390  int tau;
1392  int firstl;
1394  int lastl;
1396  int l;
1398  int plength;
1400  int plength_stab;
1401  int t_stab;
1403  fpt_step *step;
1405  fftw_plan plan = 0;
1406  int length = k_end+1;
1407  fftw_r2r_kind kinds[2] = {FFTW_REDFT01,FFTW_REDFT01};
1408 
1410  int k;
1411 
1412  double _Complex *work_ptr;
1413  const double _Complex *x_ptr;
1414 
1415  /* Check if the slow transformation should be used due to small bandwidth. */
1416  if (k_end < FPT_BREAK_EVEN)
1417  {
1418  /* Use NDSFT. */
1419  fpt_trafo_direct(set, m, x, y, k_end, flags);
1420  return;
1421  }
1422 
1423  X(next_power_of_2_exp_int)(k_end,&Nk,&tk);
1424  k_start_tilde = K_START_TILDE(data->k_start,Nk);
1425  k_end_tilde = K_END_TILDE(k_end,Nk);
1426 
1427  /* Check if fast transform is activated. */
1428  if (set->flags & FPT_NO_FAST_ALGORITHM)
1429  return;
1430 
1431  if (flags & FPT_FUNCTION_VALUES)
1432  {
1433 #ifdef _OPENMP
1434  int nthreads = X(get_num_threads)();
1435 #pragma omp critical (nfft_omp_critical_fftw_plan)
1436 {
1437  fftw_plan_with_nthreads(nthreads);
1438 #endif
1439  plan = fftw_plan_many_r2r(1, &length, 2, (double*)set->work, NULL, 2, 1,
1440  (double*)set->work, NULL, 2, 1, kinds, 0U);
1441 #ifdef _OPENMP
1442 }
1443 #endif
1444  }
1445 
1446  /* Initialize working arrays. */
1447  memset(set->result,0U,2*Nk*sizeof(double _Complex));
1448 
1449  /* The first step. */
1450 
1451  /* Set the first 2*data->k_start coefficients to zero. */
1452  memset(set->work,0U,2*data->k_start*sizeof(double _Complex));
1453 
1454  work_ptr = &set->work[2*data->k_start];
1455  x_ptr = x;
1456 
1457  for (k = 0; k <= k_end_tilde-data->k_start; k++)
1458  {
1459  *work_ptr++ = *x_ptr++;
1460  *work_ptr++ = K(0.0);
1461  }
1462 
1463  /* Set the last 2*(set->N-1-k_end_tilde) coefficients to zero. */
1464  memset(&set->work[2*(k_end_tilde+1)],0U,2*(Nk-1-k_end_tilde)*sizeof(double _Complex));
1465 
1466  /* If k_end == Nk, use three-term recurrence to map last coefficient x_{Nk} to
1467  * x_{Nk-1} and x_{Nk-2}. */
1468  if (k_end == Nk)
1469  {
1470  set->work[2*(Nk-2)] += data->gammaN[tk-2]*x[Nk-data->k_start];
1471  set->work[2*(Nk-1)] += data->betaN[tk-2]*x[Nk-data->k_start];
1472  set->work[2*(Nk-1)+1] = data->alphaN[tk-2]*x[Nk-data->k_start];
1473  }
1474 
1475  /* Compute the remaining steps. */
1476  plength = 4;
1477  for (tau = 1; tau < tk; tau++)
1478  {
1479  /* Compute first l. */
1480  firstl = FIRST_L(k_start_tilde,plength);
1481  /* Compute last l. */
1482  lastl = LAST_L(k_end_tilde,plength);
1483 
1484  /* Compute the multiplication steps. */
1485  for (l = firstl; l <= lastl; l++)
1486  {
1487  /* Copy the vectors to be multiplied into working arrays, zero-padded to twice the length. */
1488  memcpy(set->vec3,&(set->work[(plength/2)*(4*l+2)]),(plength/2)*sizeof(double _Complex));
1489  memcpy(set->vec4,&(set->work[(plength/2)*(4*l+3)]),(plength/2)*sizeof(double _Complex));
1490  memset(&set->vec3[plength/2],0U,(plength/2)*sizeof(double _Complex));
1491  memset(&set->vec4[plength/2],0U,(plength/2)*sizeof(double _Complex));
1492 
1493  /* Copy coefficients into first half. */
1494  memcpy(&(set->work[(plength/2)*(4*l+2)]),&(set->work[(plength/2)*(4*l+1)]),(plength/2)*sizeof(double _Complex));
1495  memset(&(set->work[(plength/2)*(4*l+1)]),0U,(plength/2)*sizeof(double _Complex));
1496  memset(&(set->work[(plength/2)*(4*l+3)]),0U,(plength/2)*sizeof(double _Complex));
1497 
1498  /* Get matrix U_{n,tau,l} */
1499  step = &(data->steps[tau][l]);
1500 
1501  /* Check if step is stable. */
1502  if (step->stable)
1503  {
1504  /* Check if we should do a symmetrized step. */
1505  if ((set->flags & FPT_AL_SYMMETRY) && IS_SYMMETRIC(l,m,plength))
1506  {
1507  int clength = 1<<(tau);
1508  double *a11 = step->a;
1509  double *a12 = a11+clength;
1510  double *a21 = a12+clength;
1511  double *a22 = a21+clength;
1512  /*for (k = 0; k < plength; k++)
1513  {
1514  fprintf(stderr,"fpt_trafo: a11 = %le, a12 = %le, a21 = %le, a22 = %le\n",
1515  a11[k],a12[k],a21[k],step->a22[k]);
1516  }*/
1517  /* Multiply third and fourth polynomial with matrix U. */
1518  //fprintf(stderr,"\nhallo\n");
1519  fpt_do_step_symmetric(set->vec3, set->vec4, a11,
1520  a12, a21, a22, step->g, tau, set);
1521  }
1522  else
1523  {
1524  int clength = 1<<(tau+1);
1525  double *a11 = step->a;
1526  double *a12 = a11+clength;
1527  double *a21 = a12+clength;
1528  double *a22 = a21+clength;
1529  /* Multiply third and fourth polynomial with matrix U. */
1530  fpt_do_step(set->vec3, set->vec4, a11, a12,
1531  a21, a22, step->g, tau, set);
1532  }
1533 
1534  if (step->g != 0.0)
1535  {
1536  for (k = 0; k < plength; k++)
1537  {
1538  set->work[plength*2*l+k] += set->vec3[k];
1539  }
1540  }
1541  for (k = 0; k < plength; k++)
1542  {
1543  set->work[plength*(2*l+1)+k] += set->vec4[k];
1544  }
1545  }
1546  else
1547  {
1548  /* Stabilize. */
1549 
1550  /* The length of the polynomials. */
1551  plength_stab = step->Ns;
1552  t_stab = step->ts;
1553 
1554  /*---------*/
1555  /*fprintf(stderr,"\nfpt_trafo: stabilizing at tau = %d, l = %d.\n",tau,l);
1556  fprintf(stderr,"\nfpt_trafo: plength_stab = %d.\n",plength_stab);
1557  fprintf(stderr,"\nfpt_trafo: tk = %d.\n",tk);
1558  fprintf(stderr,"\nfpt_trafo: index = %d.\n",tk-tau-1);*/
1559  /*---------*/
1560 
1561  /* Set the rest of the vectors explicitly to zero. */
1562  /*fprintf(stderr,"fpt_trafo: stabilizing: plength = %d, plength_stab = %d\n",
1563  plength, plength_stab);*/
1564  memset(&set->vec3[plength/2],0U,(plength_stab-plength/2)*sizeof(double _Complex));
1565  memset(&set->vec4[plength/2],0U,(plength_stab-plength/2)*sizeof(double _Complex));
1566 
1567  /* Multiply third and fourth polynomial with matrix U. */
1568  /* Check for symmetry. */
1569  if (set->flags & FPT_AL_SYMMETRY)
1570  {
1571  if (m <= 1)
1572  {
1573  int clength_1 = plength_stab;
1574  int clength_2 = plength_stab;
1575  double *a11 = step->a;
1576  double *a12 = a11+clength_1;
1577  double *a21 = a12+clength_1;
1578  double *a22 = a21+clength_2;
1579  fpt_do_step_symmetric(set->vec3, set->vec4, a11, a12,
1580  a21, a22, step->g, t_stab-1, set);
1581  }
1582  else if (m%2 == 0)
1583  {
1584  int clength = plength_stab/2;
1585  double *a11 = step->a;
1586  double *a12 = a11+clength;
1587  double *a21 = NULL;
1588  double *a22 = NULL;
1589  fpt_do_step_symmetric_u(set->vec3, set->vec4, a11, a12,
1590  a21, a22,
1591  set->xcvecs[t_stab-2], step->g, t_stab-1, set);
1592  }
1593  else
1594  {
1595  int clength = plength_stab/2;
1596  double *a11 = NULL;
1597  double *a12 = NULL;
1598  double *a21 = step->a;
1599  double *a22 = a21+clength;
1600  fpt_do_step_symmetric_l(set->vec3, set->vec4,
1601  a11, a12,
1602  a21,
1603  a22, set->xcvecs[t_stab-2], step->g, t_stab-1, set);
1604  }
1605  }
1606  else
1607  {
1608  int clength_1 = plength_stab;
1609  int clength_2 = plength_stab;
1610  double *a11 = step->a;
1611  double *a12 = a11+clength_1;
1612  double *a21 = a12+clength_1;
1613  double *a22 = a21+clength_2;
1614  fpt_do_step(set->vec3, set->vec4, a11, a12,
1615  a21, a22, step->g, t_stab-1, set);
1616  }
1617 
1618  if (step->g != 0.0)
1619  {
1620  for (k = 0; k < plength_stab; k++)
1621  {
1622  set->result[k] += set->vec3[k];
1623  }
1624  }
1625 
1626  for (k = 0; k < plength_stab; k++)
1627  {
1628  set->result[Nk+k] += set->vec4[k];
1629  }
1630  }
1631  }
1632  /* Double length of polynomials. */
1633  plength = plength<<1;
1634 
1635  /*--------*/
1636  /*for (k = 0; k < 2*Nk; k++)
1637  {
1638  fprintf(stderr,"work[%2d] = %le + I*%le\tresult[%2d] = %le + I*%le\n",
1639  k,creal(set->work[k]),cimag(set->work[k]),k,creal(set->result[k]),
1640  cimag(set->result[k]));
1641  }*/
1642  /*--------*/
1643  }
1644 
1645  /* Add the resulting cascade coefficients to the coefficients accumulated from
1646  * the stabilization steps. */
1647  for (k = 0; k < 2*Nk; k++)
1648  {
1649  set->result[k] += set->work[k];
1650  }
1651 
1652  /* The last step. Compute the Chebyshev coefficients c_k^n from the
1653  * polynomials in front of P_0^n and P_1^n. */
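/* The polynomial in front of P_1^n is multiplied by (alpha_0 * x + beta_0)
 * using the Chebyshev identity x * T_k(x) = (T_{k-1}(x) + T_{k+1}(x)) / 2,
 * which is where the factors 0.5 below come from. */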
1654  y[0] = data->gamma_m1*(set->result[0] + data->beta_0*set->result[Nk] +
1655  data->alpha_0*set->result[Nk+1]*0.5);
1656  y[1] = data->gamma_m1*(set->result[1] + data->beta_0*set->result[Nk+1]+
1657  data->alpha_0*(set->result[Nk]+set->result[Nk+2]*0.5));
1658  y[k_end-1] = data->gamma_m1*(set->result[k_end-1] +
1659  data->beta_0*set->result[Nk+k_end-1] +
1660  data->alpha_0*set->result[Nk+k_end-2]*0.5);
1661  y[k_end] = data->gamma_m1*(0.5*data->alpha_0*set->result[Nk+k_end-1]);
1662  for (k = 2; k <= k_end-2; k++)
1663  {
1664  y[k] = data->gamma_m1*(set->result[k] + data->beta_0*set->result[Nk+k] +
1665  data->alpha_0*0.5*(set->result[Nk+k-1]+set->result[Nk+k+1]));
1666  }
1667 
1668  if (flags & FPT_FUNCTION_VALUES)
1669  {
1670  y[0] *= 2.0;
1671  fftw_execute_r2r(plan,(double*)y,(double*)y);
1672 #ifdef _OPENMP
1673  #pragma omp critical (nfft_omp_critical_fftw_plan)
1674 #endif
1675  fftw_destroy_plan(plan);
1676  for (k = 0; k <= k_end; k++)
1677  {
1678  y[k] *= 0.5;
1679  }
1680  }
1681 }
1682 
1683 void fpt_transposed_direct(fpt_set set, const int m, double _Complex *x,
1684  double _Complex *y, const int k_end, const unsigned int flags)
1685 {
1686  int j;
1687  fpt_data *data = &(set->dpt[m]);
1688  int Nk;
1689  int tk;
1690  double norm;
1691 
1692  X(next_power_of_2_exp_int)(k_end+1,&Nk,&tk);
1693  norm = 2.0/(Nk<<1);
1694 
1695  if (set->flags & FPT_NO_DIRECT_ALGORITHM)
1696  {
1697  return;
1698  }
1699 
1700  if (flags & FPT_FUNCTION_VALUES)
1701  {
1702  for (j = 0; j <= k_end; j++)
1703  {
1704  set->xc_slow[j] = cos((KPI*(j+0.5))/(k_end+1));
1705  }
1706 
1707  eval_sum_clenshaw_transposed(k_end, k_end, set->result, set->xc_slow,
1708  y, set->work, &data->_alpha[1], &data->_beta[1], &data->_gamma[1],
1709  data->gamma_m1);
1710 
1711  memcpy(x,&set->result[data->k_start],(k_end-data->k_start+1)*
1712  sizeof(double _Complex));
1713  }
1714  else
1715  {
1716  memcpy(set->result,y,(k_end+1)*sizeof(double _Complex));
1717  memset(&set->result[k_end+1],0U,(Nk-k_end-1)*sizeof(double _Complex));
1718 
1719  for (j = 0; j < Nk; j++)
1720  {
1721  set->result[j] *= norm;
1722  }
1723 
1724  fftw_execute_r2r(set->plans_dct3[tk-2],(double*)set->result,
1725  (double*)set->result);
1726 
1727  if (set->N > 1024)
1728  eval_sum_clenshaw_transposed_ld(k_end, Nk-1, set->temp, set->xcvecs[tk-2],
1729  set->result, set->work, &data->_alpha[1], &data->_beta[1], &data->_gamma[1],
1730  data->gamma_m1);
1731  else
1732  eval_sum_clenshaw_transposed(k_end, Nk-1, set->temp, set->xcvecs[tk-2],
1733  set->result, set->work, &data->_alpha[1], &data->_beta[1], &data->_gamma[1],
1734  data->gamma_m1);
1735 
1736  memcpy(x,&set->temp[data->k_start],(k_end-data->k_start+1)*sizeof(double _Complex));
1737  }
1738 }
1739 
1740 void fpt_transposed(fpt_set set, const int m, double _Complex *x,
1741  double _Complex *y, const int k_end, const unsigned int flags)
1742 {
1743  /* Get transformation data. */
1744  fpt_data *data = &(set->dpt[m]);
1746  int Nk;
1748  int tk;
1750  int k_start_tilde;
1752  int k_end_tilde;
1753 
1755  int tau;
1757  int firstl;
1759  int lastl;
1761  int l;
1763  int plength;
1765  int plength_stab;
1767  fpt_step *step;
1769  fftw_plan plan;
1770  int length = k_end+1;
1771  fftw_r2r_kind kinds[2] = {FFTW_REDFT10,FFTW_REDFT10};
1773  int k;
1774  int t_stab;
1775 
1776  /* Check if the slow transformation should be used due to small bandwidth. */
1777  if (k_end < FPT_BREAK_EVEN)
1778  {
1779  /* Use NDSFT. */
1780  fpt_transposed_direct(set, m, x, y, k_end, flags);
1781  return;
1782  }
1783 
1784  X(next_power_of_2_exp_int)(k_end,&Nk,&tk);
1785  k_start_tilde = K_START_TILDE(data->k_start,Nk);
1786  k_end_tilde = K_END_TILDE(k_end,Nk);
1787 
1788  /* Check if fast transform is activated. */
1789  if (set->flags & FPT_NO_FAST_ALGORITHM)
1790  {
1791  return;
1792  }
1793 
1794  if (flags & FPT_FUNCTION_VALUES)
1795  {
1796 #ifdef _OPENMP
1797  int nthreads = X(get_num_threads)();
1798 #pragma omp critical (nfft_omp_critical_fftw_plan)
1799 {
1800  fftw_plan_with_nthreads(nthreads);
1801 #endif
1802  plan = fftw_plan_many_r2r(1, &length, 2, (double*)set->work, NULL, 2, 1,
1803  (double*)set->work, NULL, 2, 1, kinds, 0U);
1804 #ifdef _OPENMP
1805 }
1806 #endif
1807  fftw_execute_r2r(plan,(double*)y,(double*)set->result);
1808 #ifdef _OPENMP
1809  #pragma omp critical (nfft_omp_critical_fftw_plan)
1810 #endif
1811  fftw_destroy_plan(plan);
1812  for (k = 0; k <= k_end; k++)
1813  {
1814  set->result[k] *= 0.5;
1815  }
1816  }
1817  else
1818  {
1819  memcpy(set->result,y,(k_end+1)*sizeof(double _Complex));
1820  }
1821 
1822  /* Initialize working arrays. */
1823  memset(set->work,0U,2*Nk*sizeof(double _Complex));
1824 
1825  /* The last step is now the first step. */
1826  for (k = 0; k <= k_end; k++)
1827  {
1828  set->work[k] = data->gamma_m1*set->result[k];
1829  }
1830  //memset(&set->work[k_end+1],0U,(Nk+1-k_end)*sizeof(double _Complex));
1831 
1832  set->work[Nk] = data->gamma_m1*(data->beta_0*set->result[0] +
1833  data->alpha_0*set->result[1]);
1834  for (k = 1; k < k_end; k++)
1835  {
1836  set->work[Nk+k] = data->gamma_m1*(data->beta_0*set->result[k] +
1837  data->alpha_0*0.5*(set->result[k-1]+set->result[k+1]));
1838  }
1839  if (k_end<Nk)
1840  {
1841  memset(&set->work[k_end],0U,(Nk-k_end)*sizeof(double _Complex));
1842  }
1843 
1845  memcpy(set->result,set->work,2*Nk*sizeof(double _Complex));
1846 
1847  /* Compute the remaining steps. */
1848  plength = Nk;
1849  for (tau = tk-1; tau >= 1; tau--)
1850  {
1851  /* Compute first l. */
1852  firstl = FIRST_L(k_start_tilde,plength);
1853  /* Compute last l. */
1854  lastl = LAST_L(k_end_tilde,plength);
1855 
1856  /* Compute the multiplication steps. */
1857  for (l = firstl; l <= lastl; l++)
1858  {
1859  /* Initialize second half of coefficient arrays with zeros. */
1860  memcpy(set->vec3,&(set->work[(plength/2)*(4*l+0)]),plength*sizeof(double _Complex));
1861  memcpy(set->vec4,&(set->work[(plength/2)*(4*l+2)]),plength*sizeof(double _Complex));
1862 
1863  memcpy(&set->work[(plength/2)*(4*l+1)],&(set->work[(plength/2)*(4*l+2)]),
1864  (plength/2)*sizeof(double _Complex));
1865 
1866  /* Get matrix U_{n,tau,l} */
1867  step = &(data->steps[tau][l]);
1868 
1869  /* Check if step is stable. */
1870  if (step->stable)
1871  {
1872  if ((set->flags & FPT_AL_SYMMETRY) && IS_SYMMETRIC(l,m,plength))
1873  {
1874  /* Multiply third and fourth polynomial with matrix U. */
1875  int clength = 1<<(tau);
1876  double *a11 = step->a;
1877  double *a12 = a11+clength;
1878  double *a21 = a12+clength;
1879  double *a22 = a21+clength;
1880  fpt_do_step_t_symmetric(set->vec3, set->vec4, a11, a12,
1881  a21, a22, step->g, tau, set);
1882  }
1883  else
1884  {
1885  /* Multiply third and fourth polynomial with matrix U. */
1886  int clength = 1<<(tau+1);
1887  double *a11 = step->a;
1888  double *a12 = a11+clength;
1889  double *a21 = a12+clength;
1890  double *a22 = a21+clength;
1891  fpt_do_step_t(set->vec3, set->vec4, a11, a12,
1892  a21, a22, step->g, tau, set);
1893  }
1894  memcpy(&(set->vec3[plength/2]), set->vec4,(plength/2)*sizeof(double _Complex));
1895 
1896  for (k = 0; k < plength; k++)
1897  {
1898  set->work[plength*(4*l+2)/2+k] = set->vec3[k];
1899  }
1900  }
1901  else
1902  {
1903  /* Stabilize. */
1904  plength_stab = step->Ns;
1905  t_stab = step->ts;
1906 
1907  memcpy(set->vec3,set->result,plength_stab*sizeof(double _Complex));
1908  memcpy(set->vec4,&(set->result[Nk]),plength_stab*sizeof(double _Complex));
1909 
1910  /* Multiply third and fourth polynomial with matrix U. */
1911  if (set->flags & FPT_AL_SYMMETRY)
1912  {
1913  if (m <= 1)
1914  {
1915  int clength_1 = plength_stab;
1916  int clength_2 = plength_stab;
1917  double *a11 = step->a;
1918  double *a12 = a11+clength_1;
1919  double *a21 = a12+clength_1;
1920  double *a22 = a21+clength_2;
1921  fpt_do_step_t_symmetric(set->vec3, set->vec4, a11, a12,
1922  a21, a22, step->g, t_stab-1, set);
1923  }
1924  else if (m%2 == 0)
1925  {
1926  int clength = plength_stab/2;
1927  double *a11 = step->a;
1928  double *a12 = a11+clength;
1929  fpt_do_step_t_symmetric_u(set->vec3, set->vec4, a11, a12,
1930  set->xcvecs[t_stab-2], step->g, t_stab-1, set);
1931  }
1932  else
1933  {
1934  int clength = plength_stab/2;
1935  double *a21 = step->a;
1936  double *a22 = a21+clength;
1937  fpt_do_step_t_symmetric_l(set->vec3, set->vec4,
1938  a21, a22, set->xcvecs[t_stab-2], step->g, t_stab-1, set);
1939  }
1940  }
1941  else
1942  {
1943  int clength_1 = plength_stab;
1944  int clength_2 = plength_stab;
1945  double *a11 = step->a;
1946  double *a12 = a11+clength_1;
1947  double *a21 = a12+clength_1;
1948  double *a22 = a21+clength_2;
1949  fpt_do_step_t(set->vec3, set->vec4, a11, a12,
1950  a21, a22, step->g, t_stab-1, set);
1951  }
1952 
1953  memcpy(&(set->vec3[plength/2]),set->vec4,(plength/2)*sizeof(double _Complex));
1954 
1955  for (k = 0; k < plength; k++)
1956  {
1957  set->work[(plength/2)*(4*l+2)+k] = set->vec3[k];
1958  }
1959  }
1960  }
1961  /* Halve the length of the polynomial arrays. */
1962  plength = plength>>1;
1963  }
1964 
1965  /* First step */
1966  for (k = 0; k <= k_end_tilde-data->k_start; k++)
1967  {
1968  x[k] = set->work[2*(data->k_start+k)];
1969  }
1970  if (k_end == Nk)
1971  {
1972  x[Nk-data->k_start] =
1973  data->gammaN[tk-2]*set->work[2*(Nk-2)]
1974  + data->betaN[tk-2] *set->work[2*(Nk-1)]
1975  + data->alphaN[tk-2]*set->work[2*(Nk-1)+1];
1976  }
1977 }
1978 
1979 void fpt_finalize(fpt_set set)
1980 {
1981  int tau;
1982  int l;
1983  int m;
1984  int k_start_tilde;
1985  int N_tilde;
1986  int firstl, lastl;
1987  int plength;
1988  const int M = set->M;
1989 
1990  /* TODO Clean up DPT transform data structures. */
1991  if (!(set->flags & FPT_NO_INIT_FPT_DATA))
1992  {
1993  for (m = 0; m < M; m++)
1994  {
1995  /* Check if precomputed. */
1996  fpt_data *data = &set->dpt[m];
1997  if (data->steps != (fpt_step**)NULL)
1998  {
1999  if (!(set->flags & FPT_NO_FAST_ALGORITHM))
2000  {
2001  nfft_free(data->alphaN);
2002  data->alphaN = NULL;
2003  data->betaN = NULL;
2004  data->gammaN = NULL;
2005  }
2006 
2007  /* Free precomputed data. */
2008  k_start_tilde = K_START_TILDE(data->k_start,X(next_power_of_2)(data->k_start)
2009  /*set->N*/);
2010  N_tilde = N_TILDE(set->N);
2011  plength = 4;
2012  for (tau = 1; tau < set->t; tau++)
2013  {
2014  /* Compute first l. */
2015  firstl = FIRST_L(k_start_tilde,plength);
2016  /* Compute last l. */
2017  lastl = LAST_L(N_tilde,plength);
2018 
2019  /* For l = firstl,...,lastl free the matrices U_{n,tau,l}. */
2020  for (l = firstl; l <= lastl; l++)
2021  {
2022  /* Free components. */
2023  if (data->steps[tau][l].a != NULL)
2024  {
2025  nfft_free(data->steps[tau][l].a);
2026  data->steps[tau][l].a = NULL;
2027  }
2028  }
2029  /* Free pointers for current level. */
2030  nfft_free(data->steps[tau]);
2031  data->steps[tau] = NULL;
2032  /* Double length of polynomials. */
2033  plength = plength<<1;
2034  }
2035  /* Free steps. */
2036  nfft_free(data->steps);
2037  data->steps = NULL;
2038  }
2039 
2040  if (!(set->flags & FPT_NO_DIRECT_ALGORITHM))
2041  {
2042  /* Free the recurrence coefficients only if they were copied, i.e., if the data is not persistent. */
2043  if (!(set->flags & FPT_PERSISTENT_DATA))
2044  {
2045  if (data->_alpha != NULL)
2046  nfft_free(data->_alpha);
2047  }
2048  data->_alpha = NULL;
2049  data->_beta = NULL;
2050  data->_gamma = NULL;
2051  }
2052  }
2053 
2054  /* Delete array of DPT transform data. */
2055  nfft_free(set->dpt);
2056  set->dpt = NULL;
2057  }
2058 
2059  for (tau = 1; tau < set->t+1; tau++)
2060  {
2061  nfft_free(set->xcvecs[tau-1]);
2062  set->xcvecs[tau-1] = NULL;
2063  }
2064  nfft_free(set->xcvecs);
2065  set->xcvecs = NULL;
2066 
2067  /* Free auxiliary arrays. */
2068  nfft_free(set->work);
2069  nfft_free(set->result);
2070  set->work = NULL;
2071  set->result = NULL;
2072 
2073  /* Free FFTW plans. */
2074  for(tau = 0; tau < set->t/*-1*/; tau++)
2075  {
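 /* The FFTW planner is not thread-safe, so plan destruction is guarded by
  * the same critical section that protects plan creation when OpenMP is
  * enabled. */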
2076 #ifdef _OPENMP
2077 #pragma omp critical (nfft_omp_critical_fftw_plan)
2078 #endif
2079 {
2080  fftw_destroy_plan(set->plans_dct3[tau]);
2081  fftw_destroy_plan(set->plans_dct2[tau]);
2082 }
2083  set->plans_dct3[tau] = NULL;
2084  set->plans_dct2[tau] = NULL;
2085  }
2086 
2087  nfft_free(set->plans_dct3);
2088  nfft_free(set->plans_dct2);
2089  set->plans_dct3 = NULL;
2090  set->plans_dct2 = NULL;
2091 
2092  /* Check if fast transform is activated. */
2093  if (!(set->flags & FPT_NO_FAST_ALGORITHM))
2094  {
2095  /* Free auxiliary arrays. */
2096  nfft_free(set->vec3);
2097  nfft_free(set->vec4);
2098  nfft_free(set->z);
2099  set->vec3 = NULL;
2100  set->vec4 = NULL;
2101  set->z = NULL;
2102  }
2103 
2104  if (!(set->flags & FPT_NO_DIRECT_ALGORITHM))
2105  {
2106  /* Delete arrays of Chebyshev nodes. */
2107  nfft_free(set->xc_slow);
2108  set->xc_slow = NULL;
2109  nfft_free(set->temp);
2110  set->temp = NULL;
2111  }
2112 
2113  /* Free DPT set structure. */
2114  nfft_free(set);
2115 }
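
To tie the calls above together, the following is a minimal, hypothetical usage sketch rather than a verbatim example from the library: the transform exponent t, the coefficient-array length N + 2, the stabilization threshold 1000.0 and the zero flag words are assumptions, and the actual recurrence coefficients depend on the polynomial basis handed to fpt_precompute.

#include <complex.h>
#include <stdlib.h>
#include "nfft3.h"

int main(void)
{
  const int t = 6;                     /* exponent: transform length N = 2^t       */
  const int N = 1 << t;
  const int m = 0;                     /* index of the (only) transform in the set */
  const int k_start = 0, k_end = N;

  /* Create a set holding M = 1 transform of length 2^t with default flags. */
  fpt_set set = fpt_init(1, t, 0U);

  /* Three-term recurrence coefficients of the target basis; the length
   * N + 2 and the indexing convention are assumptions, see fpt_precompute. */
  double *alpha = (double*)calloc((size_t)(N + 2), sizeof(double));
  double *beta  = (double*)calloc((size_t)(N + 2), sizeof(double));
  double *gam   = (double*)calloc((size_t)(N + 2), sizeof(double));
  /* ... fill alpha, beta, gam for the desired polynomial basis ... */
  fpt_precompute(set, m, alpha, beta, gam, k_start, 1000.0);

  /* Transposed transform: y carries k_end + 1 input Chebyshev coefficients
   * (flags = 0U, i.e. coefficients rather than function values), x receives
   * k_end - k_start + 1 output coefficients. */
  double _Complex *y = (double _Complex*)calloc((size_t)(k_end + 1), sizeof(double _Complex));
  double _Complex *x = (double _Complex*)calloc((size_t)(k_end - k_start + 1), sizeof(double _Complex));
  /* ... fill y with input data ... */
  fpt_transposed(set, m, x, y, k_end, 0U);

  fpt_finalize(set);
  free(x); free(y); free(alpha); free(beta); free(gam);
  return 0;
}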
static void eval_sum_clenshaw_transposed(int N, int M, double _Complex *a, double *x, double _Complex *y, double _Complex *temp, double *alpha, double *beta, double *gam, double lambda)
Clenshaw algorithm.
Definition: fpt.c:717
#define K_START_TILDE(x, y)
Minimum degree at top of a cascade.
Definition: fpt.c:48
#define K_END_TILDE(x, y)
Maximum degree at top of a cascade.
Definition: fpt.c:51
#define FIRST_L(x, y)
Index of first block of four functions at level.
Definition: fpt.c:54
#define LAST_L(x, y)
Index of last block of four functions at level.
Definition: fpt.c:57
#define FPT_PERSISTENT_DATA
If set, TODO complete comment.
Definition: nfft3.h:619
#define FPT_NO_FAST_ALGORITHM
If set, TODO complete comment.
Definition: nfft3.h:617
#define FPT_NO_STABILIZATION
If set, no stabilization will be used.
Definition: nfft3.h:616
#define X(name)
We expand this macro for each supported precision.
Definition: nfft3.h:99
void fpt_transposed(fpt_set set, const int m, double _Complex *x, double _Complex *y, const int k_end, const unsigned int flags)
Definition: fpt.c:1740
#define FPT_AL_SYMMETRY
If set, TODO complete comment.
Definition: nfft3.h:624
fpt_set fpt_init(const int M, const int t, const unsigned int flags)
Definition: fpt.c:795
void fpt_precompute(fpt_set set, const int m, double *alpha, double *beta, double *gam, int k_start, const double threshold)
Definition: fpt.c:1307
#define FPT_FUNCTION_VALUES
If set, the output are function values at Chebyshev nodes rather than Chebyshev coefficients.
Definition: nfft3.h:623
#define FPT_NO_DIRECT_ALGORITHM
If set, TODO complete comment.
Definition: nfft3.h:618
#define UNUSED(x)
Dummy use of unused parameters to silence compiler warnings.
Definition: infft.h:1368
Internal header file for auxiliary definitions and functions.
Header file for the nfft3 library.
Holds data for a single cascade summation.
Definition: fpt.h:46
double * _alpha
TODO Add comment here.
Definition: fpt.h:56
double * _gamma
TODO Add comment here.
Definition: fpt.h:58
double * gammaN
TODO Add comment here.
Definition: fpt.h:51
fpt_step ** steps
The cascade summation steps
Definition: fpt.h:47
double beta_0
TODO Add comment here.
Definition: fpt.h:53
double * alphaN
TODO Add comment here.
Definition: fpt.h:49
double * betaN
TODO Add comment here.
Definition: fpt.h:50
double gamma_m1
TODO Add comment here.
Definition: fpt.h:54
double alpha_0
TODO Add comment here.
Definition: fpt.h:52
double * _beta
TODO Add comment here.
Definition: fpt.h:57
int k_start
TODO Add comment here.
Definition: fpt.h:48
Holds data for a set of cascade summations.
Definition: fpt.h:66
int t
The exponent of N
Definition: fpt.h:71
double ** xcvecs
Array of pointers to arrays containing the Chebyshev nodes
Definition: fpt.h:73
int M
The number of DPT transforms
Definition: fpt.h:68
unsigned int flags
The flags
Definition: fpt.h:67
fftw_r2r_kind * kinds
Transform kinds for fftw library
Definition: fpt.h:87
fftw_plan * plans_dct2
Transform plans for the fftw library
Definition: fpt.h:85
fpt_data * dpt
The DPT transform data
Definition: fpt.h:72
int N
The transform length.
Definition: fpt.h:69
fftw_r2r_kind * kindsr
Transform kinds for fftw library
Definition: fpt.h:89
fftw_plan * plans_dct3
Transform plans for the fftw library
Definition: fpt.h:83
Holds data for a single multiplication step in the cascade summation.
Definition: fpt.h:31
double * a
The matrix components
Definition: fpt.h:37
bool stable
Indicates if the values contained represent a fast or a slow stabilized step.
Definition: fpt.h:32
int Ns
TODO Add comment here.
Definition: fpt.h:35
int ts
TODO Add comment here.
Definition: fpt.h:36