I am trying to run a roms application as OpenMP.
I use ROMS version 3.1 and compile it as OMP using a pgi compiler on a Sun station (linux, 4 proc)
and obtain executable oceanO.
I set env variable OMP_NUM_THREADS to 2 and set the number of tiles to 2 in the roms .in file.
I am trying to run the executable and it reports the following warning:
Warning: OMP_NUM_THREADS or NCPUS (2) greater than available cpus (1)
The thing runs but apparently on one processor.
(and yes, I initially tried to run it on 4).
Any advice on how to make it run on several processors?
OMP_NUM_THREADS or NCPUS (2) greater than available cpus (1)
Re: OMP_NUM_THREADS or NCPUS (2) greater than available cpus (1)
Most likely your problem is related not to the ROMS code you use,
but to the machine and/or its operating system.
Open MP programs can be executed in a single processor machine, but,
of course, threads go sequentially from one synchronization barrier
to the next, resulting in no parallel gain.
To test this you need an Open MP "Hello World!" kind of problem.
Save the following code as file called pie.F and spend fifteen minutes
playing with it.
First, just compile and make sure that it runs, simply
ifort pie.F
or
pgif90 pie.F
then execute a.out and see what happens.
Then recompile it using -openmp flag (do not need to specify any other
flags, it just keep CPUs busy for a few seconds) and run again.
MAKE SURE THAT YOU HAVE GAIN IN SPEED WHEN USING MORE THAN ONE THREAD.
Here is the program;
but to the machine and/or its operating system.
Open MP programs can be executed in a single processor machine, but,
of course, threads go sequentially from one synchronization barrier
to the next, resulting in no parallel gain.
To test this you need an Open MP "Hello World!" kind of problem.
Save the following code as file called pie.F and spend fifteen minutes
playing with it.
First, just compile and make sure that it runs, simply
ifort pie.F
or
pgif90 pie.F
then execute a.out and see what happens.
Then recompile it using -openmp flag (do not need to specify any other
flags, it just keep CPUs busy for a few seconds) and run again.
MAKE SURE THAT YOU HAVE GAIN IN SPEED WHEN USING MORE THAN ONE THREAD.
Here is the program;
Code: Select all
program pie
! 1
! Since integral of f(x) = --------- is arctan(x), therefore
! 1 + x*x
!
! pi = 4 * integral from 0 to 1 of f(x) dx, which leads to the
! method for computing pi based linear, cubic, or quintic integration
! formula, stating that integral of f(x) over small interval dx can
! be approximated with 6th order of accuracy as
!
! f_R + f_L f1_R - f1_L f2_R + f2_L
! dx * ----------- - dx^2 * ------------- + dx^3 * ------------
! 2 10 120
!
! where f_R,L; f1_R,L, and f2_R,L values of function f(x), its first
! and second derivatives at the Right and Left ends of interval dx.
!
#define KIND 16
#define SUM_BY_PAIRS
#define ORDER 8
implicit none
integer trd_count
real(kind=KIND) pi
real(kind=4) CPU_time_ALL(4)
common /scalars/ pi, CPU_time_all, trd_count
integer iclk_start, iclk_end, iclk_rate, iclk_max
call system_clock (iclk_start, iclk_rate, iclk_max)
pi=0.0_16
trd_count=0
C$OMP PARALLEL
call pie_thread
C$OMP END PARALLEL
call system_clock (iclk_end, iclk_rate, iclk_max)
write(*,'(/1x,A,F12.4,1x,A/)') 'Elapsed Wall Clock Time =',
& dble(iclk_end-iclk_start)/dble(iclk_rate), 'sec'
stop
end
subroutine pie_thread
implicit none
integer nmax, numthreads, trd, trd_count, imax,
& npts, istr, iend, i, inc
parameter (nmax=2000000)
real(kind=KIND) Zero, OneTwenty, OneTwelfth, OneNinth,
& OneEighth, OneSixth, OneFifth, ThreeFourteenth, Half,
& ThreeQuarter, One, Three, Four, x,dx, HalfX, OneFifthX,
& OneTwentyX, OneNinthX, ThreeFourteenthX,OneSixthX,
& OneTwelfthX, pi, A(5,0:nmax)
real(kind=4) CPU_time_ALL(4)
common /scalars/ pi,CPU_time_ALL,trd_count /arrays/ A
integer f,f1,f2,f3, sum
parameter (sum=1, f=2, f1=3, f2=4, f3=5)
#if KIND > 8
parameter (Zero=0.0_16, One=1.0_16, OneTwenty=0.05_16,
& OneTwelfth=1.0_16/12.0_16, OneNinth=1.0_16/9.0_16,
& OneEighth=0.125_16, OneSixth=1.0_16/6.0_16,
& OneFifth=0.2_16, Half=0.5_16,
& ThreeFourteenth=3.0_16/14.0_16,ThreeQuarter=0.75_16,
& Three=3.0_16, Four=4.0_16)
#else
parameter (Zero=0.0_8, One=1.0_8, OneTwenty=0.05_8,
& OneTwelfth=1.0_8/12.0_8, OneNinth=1.0_8/9.0_8,
& OneEighth=0.125_8, OneFifth=0.2_8, Half=0.5_8,
& ThreeFourteenth=3.0_8/14.0_8, ThreeQuarter=0.75_8,
& Three=3.0_8, Four=4.0_8)
#endif
character(len=41) diff
C$ real(kind=4) CPU_time(3), etime
C$ integer omp_get_thread_num, omp_get_num_threads
C$ CPU_time(1)=etime(CPU_time(2))
numthreads=1
C$ numthreads=omp_get_num_threads()
trd=0
C$ trd=omp_get_thread_num()
C$OMP CRITICAL
write(*,*) 'thread', trd, ' of', numthreads
C$OMP END CRITICAL
imax=1
do while (imax.lt.nmax)
npts=(imax+numthreads-1)/numthreads
istr=1+trd*npts
iend=min(istr+npts-1,imax)
A(sum,istr)=Zero
if (trd.eq.0) then
A(f,0)=Four
A(f1,0)=Zero
A(f2,0)=-Half*Four*Four
endif
dx=One/imax
do i=istr,iend
x=dx*i
A(f,i) = Four/(One+x*x)
A(f1,i) = -Half*x*A(f,i)*A(f,i)
A(f2,i) = -OneEighth*(One-Three*x*x) *A(f,i)*A(f,i)*A(f,i)
A(f3,i) = -ThreeQuarter*(One-x*x) * A(f1,i) * A(f,i)*A(f,i)
enddo
C$OMP BARRIER
#if ORDER == 2
HalfX=Half*dx
do i=istr,iend
A(sum,i)=HalfX*(A(f,i)+A(f,i-1))
enddo
#elif ORDER == 4
HalfX=Half*dx
OneSixthX=OneSixth*dx
do i=istr,iend
A(sum,i)=HalfX*( A(f,i)+A(f,i-1)
& - OneSixthX*(A(f1,i)-A(f1,i-1)))
enddo
#elif ORDER == 6
HalfX=Half*dx
OneFifthX=OneFifth*dx
OneTwelfthX=OneTwelfth*dx
do i=istr,iend
A(sum,i)=HalfX*( A(f,i)+A(f,i-1) - OneFifthX*(
& A(f1,i)-A(f1,i-1) - OneTwelfthX*(A(f2,i)+A(f2,i-1))
& ))
enddo
#elif ORDER == 8
HalfX=Half*dx
ThreeFourteenthX=ThreeFourteenth*dx
OneNinthX=OneNinth*dx
OneTwentyX=OneTwenty*dx
do i=istr,iend
A(sum,i)=HalfX*( A(f,i)+A(f,i-1) - ThreeFourteenthX*(
& A(f1,i)-A(f1,i-1) - OneNinthX*( A(f2,i)+A(f2,i-1)
& - OneTwentyX*(A(f3,i)-A(f3,i-1))
& )))
enddo
#endif
C$OMP BARRIER
#ifdef SUM_BY_PAIRS
inc=1
do while (istr.le.iend-inc)
do i=istr,iend-inc,2*inc
A(sum,i)=A(sum,i)+A(sum,i+inc)
enddo
inc=2*inc
enddo
#else
do i=istr+1,iend
A(sum,istr)=A(sum,istr)+A(sum,i)
enddo
#endif
C$OMP CRITICAL
trd_count=trd_count+1
pi=pi+A(sum,istr)
if (trd_count.eq.numthreads) then
trd_count=0
write(diff,'(F41.38)') pi
#if KIND > 8
& -3.14159265358979323846264338327950279_16
#else
& -3.14159265358979311_8
#endif
i=1
do while (diff(i:i).ne.'.')
i=i+1
enddo
diff(i:i)=':'
i=i+1
do while (diff(i:i).eq.'0' .and. i.lt.41)
diff(i:i)='.'
i=i+1
enddo
write(*,'(I7,F38.35,A,I4)') imax, pi, diff, trd
pi=0.0_16
endif
C$OMP END CRITICAL
C$OMP BARRIER
c imax=(imax+1)*10.D0**0.5D0
c imax=(imax+1)*10.D0**0.1666666666666666D0
imax=(imax+1)*10.D0**0.1D0
enddo
C$ CPU_time(1) = etime(CPU_time(2)) - CPU_time(1)
C$OMP CRITICAL
C$ write(*,'(A8,I3,A5,F12.3,A5,F10.3,A5,F12.5)')
C$ & 'Thread #', trd, ' CPU:', CPU_time(2),
C$ & ' sys:', CPU_time(3), ' net:', CPU_time(1)
C$ trd_count=trd_count+1
C$ CPU_time_ALL(1)=CPU_time_ALL(1)+CPU_time(1)
C$ CPU_time_ALL(2)=CPU_time_ALL(2)+CPU_time(2)
C$ CPU_time_ALL(3)=CPU_time_ALL(3)+CPU_time(3)
C$ if (trd_count.eq.numthreads) then
C$ trd_count=0
C$ write(*,'(A6,8x,F14.3,3x,F12.3,1x,F14.3)') 'TOTAL:',
C$ & CPU_time_ALL(2), CPU_time_ALL(3), CPU_time_ALL(1)
C$ endif
C$OMP END CRITICAL
return
end