126. OpenMP
如果在代码中用 OpenMP 并行展开了太多的 for 循环,反而可能带来性能问题。例如下面的修改注释掉了 clarkloop.cpp 中的一处 `#pragma omp parallel for`:
diff --git a/deconvolution/clarkloop.cpp b/deconvolution/clarkloop.cpp
index 96fb91f..e017150 100644
--- a/deconvolution/clarkloop.cpp
+++ b/deconvolution/clarkloop.cpp
@@ -86,7 +86,7 @@ boost::optional<double> ClarkLoop::Run(ImageSet& convolvedResidual, const ao::uv
double* image = _clarkModel.Residual()[imgIndex];
const double* psf = doubleConvolvedPsfs[_clarkModel.Residual().PSFIndex(imgIndex)];
double psfFactor = componentValues[imgIndex];
- #pragma omp parallel for
+ //#pragma omp parallel for
for(size_t px=0; px <_clarkModel.size(); ++px)
{
int psfX = _clarkModel.X(px) - x + _width/2;
由于存在数据依赖,循环无法有效地并行展开,几乎所有 CPU 核心都会被占用(htop 输出如下):
[user1@taishan-arm-cpu03 perf]$ htop
1 [||||||||||||| 60.5%] 25 [||||||||||||| 58.6%] 49 [|||||||||||||| 60.3%] 73 [||||||||||||| 60.5%]
2 [|||||||||||| 56.6%] 26 [||||||||||||| 57.6%] 50 [||||||||||||| 61.7%] 74 [||||||||||||| 59.2%]
3 [|||||||||||||| 64.8%] 27 [||||||||||||| 57.9%] 51 [||||||||||||| 59.9%] 75 [||||||||||||| 61.2%]
4 [||||||||||||| 57.1%] 28 [||||||||||||| 59.2%] 52 [|||||||||||||| 61.7%] 76 [|||||||||||| 58.9%]
5 [||||||||||||| 56.2%] 29 [||||||||||||| 60.0%] 53 [||||||||||||| 60.3%] 77 [||||||||||||| 62.2%]
6 [||||||||||||| 55.9%] 30 [||||||||||||| 58.7%] 54 [||||||||||||| 58.9%] 78 [||||||||||||| 61.0%]
7 [||||||||||||| 57.9%] 31 [|||||||||||||| 60.8%] 55 [||||||||||||| 60.0%] 79 [||||||||||||| 60.3%]
8 [||||||||||||| 56.8%] 32 [||||||||||||| 58.4%] 56 [|||||||||||| 58.9%] 80 [|| 2.0%]
9 [||||||||||||| 59.7%] 33 [||||||||||||| 60.5%] 57 [||||||||||||| 61.2%] 81 [||||||||||||| 60.0%]
10 [||||||||||||| 58.1%] 34 [|||||||||||||| 61.0%] 58 [||||||||||||| 60.0%] 82 [||||||||||||| 58.4%]
11 [||||||||||||| 57.0%] 35 [||||||||||||| 59.7%] 59 [||||||||||||| 59.7%] 83 [||||||||||||| 60.5%]
12 [||||||||||||| 56.2%] 36 [|||||||||||| 58.6%] 60 [||||||||||||| 59.2%] 84 [||||||||||||| 60.8%]
13 [|||||||||||||||69.8%] 37 [||||||||||||| 59.7%] 61 [|||||||||||| 59.5%] 85 [||||||||||||| 58.7%]
14 [|||||||||||| 56.3%] 38 [||||||||||||| 59.9%] 62 [|||||||||||||| 60.3%] 86 [ 0.0%]
15 [|||||||||||| 56.2%] 39 [||||||||||||| 59.7%] 63 [||||||||||||| 59.7%] 87 [||||||||||||| 60.0%]
16 [||||||||||||| 56.2%] 40 [|||||||||||| 58.2%] 64 [||||||||||||| 59.7%] 88 [||||||||||||| 59.5%]
17 [||||||||||||| 56.2%] 41 [||||||||||||| 58.4%] 65 [||||||||||||| 59.2%] 89 [||||||||||||| 58.7%]
18 [||||||||||||| 60.9%] 42 [||||||||||||| 59.6%] 66 [|||||||||||| 57.7%] 90 [||||||||||||| 60.5%]
19 [|||||||||||| 56.5%] 43 [||||||||||||| 59.9%] 67 [||||||||||||| 60.0%] 91 [ 0.0%]
20 [|||||||||||| 59.6%] 44 [||||||||||||| 57.2%] 68 [||||||||||||| 60.0%] 92 [||||||||||||| 58.9%]
21 [|||||||||||| 57.1%] 45 [||||||||||||| 59.4%] 69 [||||||||||||| 57.8%] 93 [||||||||||||| 59.5%]
22 [|||||||||||| 54.3%] 46 [||||||||||||| 60.1%] 70 [||||||||||||| 60.3%] 94 [|||||||||||| 58.9%]
23 [||||||||||||| 58.4%] 47 [||||||||||||| 60.5%] 71 [||||||||||||| 60.0%] 95 [||||||||||||| 58.1%]
24 [|||||||||||| 55.2%] 48 [||||||||||||| 59.6%] 72 [||||||||||||| 60.5%] 96 [||||||||||||| 58.3%]
Mem[||||| 28.8G/1021G] Tasks: 45, 319 thr; 57 running
Swp[ 0K/16.0G] Load average: 37.59 39.73 30.82
Uptime: 5 days, 18:12:00
PID CPU USER PRI NI VIRT RES SHR S CPU% MEM% TIME+ Command
52123 13 sjtu_chif 20 0 16.4G 9551M 32064 R 5575 0.9 18h33:20 /home/user1/sourcecode/wsclean-2.7/build/wscle
57611 63 sjtu_chif 20 0 16.4G 9551M 32064 S 59.7 0.9 9:26.69 /home/user1/sourcecode/wsclean-2.7/build/wscle
57632 76 sjtu_chif 20 0 16.4G 9551M 32064 S 59.7 0.9 9:35.22 /home/user1/sourcecode/wsclean-2.7/build/wscle
57643 70 sjtu_chif 20 0 16.4G 9551M 32064 R 59.7 0.9 9:34.92 /home/user1/sourcecode/wsclean-2.7/build/wscle
57677 77 sjtu_chif 20 0 16.4G 9551M 32064 R 59.7 0.9 9:21.89 /home/user1/sourcecode/wsclean-2.7/build/wscle
57678 92 sjtu_chif 20 0 16.4G 9551M 32064 R 60.3 0.9 9:33.56 /home/user1/sourcecode/wsclean-2.7/build/wscle
57658 67 sjtu_chif 20 0 16.4G 9551M 32064 R 59.7 0.9 9:38.84 /home/user1/sourcecode/wsclean-2.7/build/wscle
57651 90 sjtu_chif 20 0 16.4G 9551M 32064 S 59.0 0.9 9:22.89 /home/user1/sourcecode/wsclean-2.7/build/wscle
57668 57 sjtu_chif 20 0 16.4G 9551M 32064 S 60.3 0.9 9:18.94 /home/user1/sourcecode/wsclean-2.7/build/wscle
F1Help F2Setup F3SearchF4FilterF5Tree F6SortByF7Nice -F8Nice +F9Kill F10Quit
在 perf 的热点统计中也可以看到,绝大部分时间都消耗在 libgomp 的 barrier 等待上:
Samples: 2M of event 'cycles:ppp', 4000 Hz, Event count (approx.): 805718621249
Overhead Shared Object Symbol
44.99% libgomp.so.1.0.0 [.] gomp_barrier_wait_end
43.95% libgomp.so.1.0.0 [.] gomp_team_barrier_wait_end
5.59% [kernel] [k] queued_spin_lock_slowpath
0.80% libgomp.so.1.0.0 [.] gomp_barrier_wait
0.75% libgomp.so.1.0.0 [.] gomp_team_barrier_wait_final
0.65% [kernel] [k] arch_cpu_idle
0.54% [kernel] [k] finish_task_switch
调用栈的情况如下,Thread 1 创建了很多线程:
Thread 2 (Thread 0xfffcba85f050 (LWP 57701)):
#0 0x0000ffff8632a6f0 in syscall () from /lib64/libc.so.6
#1 0x0000ffff8643abe4 in futex_wait (val=6569840, addr=<optimized out>) at ../.././libgomp/config/linux/futex.h:45
#2 do_wait (val=6569840, addr=<optimized out>) at ../.././libgomp/config/linux/wait.h:67
#3 gomp_barrier_wait_end (bar=<optimized out>, state=6569840) at ../.././libgomp/config/linux/bar.c:48
#4 0x0000ffff864382d8 in gomp_simple_barrier_wait (bar=<optimized out>) at ../.././libgomp/config/posix/simple-bar.h:60
#5 gomp_thread_start (xdata=<optimized out>) at ../.././libgomp/team.c:127
#6 0x0000ffff86677c48 in start_thread () from /lib64/libpthread.so.0
#7 0x0000ffff8632f600 in thread_start () from /lib64/libc.so.6
Thread 1 (Thread 0xffff85a19020 (LWP 52123)):
#0 0x0000ffff8632a6f0 in syscall () from /lib64/libc.so.6
#1 0x0000ffff8643ae74 in futex_wait (val=6569832, addr=<optimized out>) at ../.././libgomp/config/linux/futex.h:45
#2 do_wait (val=6569832, addr=<optimized out>) at ../.././libgomp/config/linux/wait.h:67
#3 gomp_team_barrier_wait_end (bar=<optimized out>, state=6569832) at ../.././libgomp/config/linux/bar.c:112
#4 0x0000ffff8643afe4 in gomp_team_barrier_wait_final (bar=<optimized out>) at ../.././libgomp/config/linux/bar.c:136
#5 0x0000ffff8643949c in gomp_team_end () at ../.././libgomp/team.c:934
#6 0x00000000005bea8c in ClarkLoop::Run (this=this@entry=0xffffc9191190, convolvedResidual=..., doubleConvolvedPsfs=...) at /home/user1/sourcecode/wsclean-2.7/deconvolution/clarkloop.cpp:89
#7 0x00000000004de618 in GenericClean::ExecuteMajorIteration (this=<optimized out>, dirtySet=..., modelSet=..., psfs=..., width=4000, height=4000, reachedMajorThreshold=@0xffffc9191ef0: true) at /home/user1/sourcecode/wsclean-2.7/deconvolution/genericclean.cpp:81
#8 0x00000000004f8d54 in ParallelDeconvolution::ExecuteMajorIteration (this=this@entry=0xffffc91936e8, dataImage=..., modelImage=..., psfImages=..., reachedMajorThreshold=@0xffffc9191ef0: true) at /home/user1/sourcecode/wsclean-2.7/deconvolution/paralleldeconvolution.cpp:164
#9 0x00000000004cdc4c in Deconvolution::Perform (this=this@entry=0xffffc91936e0, groupTable=..., reachedMajorThreshold=@0xffffc9191ef0: true, majorIterationNr=4) at /home/user1/sourcecode/wsclean-2.7/deconvolution/deconvolution.cpp:142
#10 0x0000000000482408 in WSClean::runIndependentGroup (this=this@entry=0xffffc91927f0, groupTable=..., primaryBeam=...) at /home/user1/sourcecode/wsclean-2.7/wsclean/wsclean.cpp:727
#11 0x000000000048afb0 in WSClean::RunClean (this=0xffffc91927f0) at /home/user1/sourcecode/wsclean-2.7/wsclean/wsclean.cpp:472
#12 0x0000000000461ff8 in CommandLine::Run (wsclean=...) at /home/user1/sourcecode/wsclean-2.7/wsclean/commandline.cpp:1308
#13 0x0000000000454aac in main (argc=32, argv=0xffffc9193a08) at /home/user1/sourcecode/wsclean-2.7/wscleanmain.cpp:13
完整的调用栈信息请查看 52123