POWER FORTRAN Accelerator

Kenneth Josiah Harris kj at sgi.com
Tue May 8 09:47:54 AEST 1990


In article <9019 at hydra.gatech.EDU> ccsupeh at prism.gatech.EDU (Eric Hoffman) writes:
>

>Does anyone have any experience with the Power Fortran Accelerator?
>I have been trying to run PFA on a POWER SERIES 4D/220S with 2 CPU 
>without any success: it slows down my codes.
>I am using a fairly simple code to do tests. Whatever (-On) option I use,
>(-pfa) increases both clock and cpu time.
>Any clue?
>
>The test code follows:
>
>
>      PROGRAM test_mat
>c size: nmaxs
>      parameter(nmaxs=1024,mmaxs=1024)
>      real alpha(nmaxs),beta(mmaxs),o(nmaxs,mmaxs)
>      data m,n/1024,1024/
>      time0=second()
>      do 20 j=1,n
>      do 10 i=1,m
>      o(i,j)=float(i)+float(j)
> 10   continue
c!!!!!!!!!!!!!!!!!!!!!!!!!!!
c is this what you wanted:
	alpha(j)=float(j)
>      alpha(i)=float(i)
> 20   continue
>      do 30 i=1,120
>      call HSMVPS(M,N,o,nmaxs,alpha,beta,IER)
> 30   continue
>      time1=second()
>      write(6,*)' Total : ',time1-time0
>      stop
>      end
>      subroutine HSMVPS(M,N,A,NROWA,X,Y,IER)
>      REAL A(NROWA,N),x(n),y(m)
>      do 29 j=1,n
>      do 27 i=1,m
>      y(i)=y(i)+a(i,j)*x(j)
> 27   continue
> 29   continue
>      end


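	Basically, there is a data dependence in "y(i)=y(i)+a(i,j)*x(j)":
every iteration of the outer j loop updates the same elements y(i), so the
j iterations cannot simply be run in parallel.  This is called a
"sum reduction".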

This exact example is discussed in "FORTRAN 77 Programmer's Guide",
Section 5.6, example 3, page 5-24.

	You can break the data dependence a couple of ways.  One is to
interchange the loops.  The new PFA will do sum reductions automatically.

	If you use "-pfa keep" in the compile line, you can get a listing
file that you can use to determine what the preprocessor thinks.

	You can use prof and pixie to find where the code is spending time.

	You can use "fsplit" to put each subroutine in a separate file
so that you can use PFA on just some subroutines.

	Here is a makefile that I use for benchmarks; it has a lot of the
flags and the prof and pixie stuff built in:


#!/usr/bin/make -f
# Benchmark makefile for $(MAIN): compiles with the Power Fortran Accelerator
# and drives the prof/pixie/cord targets defined further down.
# The interpreter line carries "-f" so that executing this file makes make
# read it as the makefile instead of treating its path as a goal name.
#
# Not assigned in this file (they come from make's defaults, the environment
# or the command line): FC, TOOLROOT, PROFDIR.
# NOTE(review): with TOOLROOT empty the tool paths resolve under /usr/lib;
# with PROFDIR empty the report targets resolve to /$(MAIN).pix.out and
# /$(MAIN).prof.out -- confirm PROFDIR is always set when those are built.

##### PIXFLAGS: options passed to "prof -pixie" for the pixie report #####
#PIXFLAGS = -quit 1% -invoc -only sin -only pow
PIXFLAGS = -quit 1%

##### PFA: Power Fortran Accelerator options #####
# PFAOPT is used when the pfa preprocessor is run by hand (.f.m rule).
PFAOPT = -o=5 -roundoff=2
# FPFA is the compiler-driver form; "-pfa keep" keeps the listing file.
# NOTE(review): the -WK list repeats PFAOPT by hand; keep the two in step.
# The commented alternative switches FPFA to "-mp".
FPFA = -pfa keep -WK,-o=5,-roundoff=2
#FPFA = -mp

##### FLAGS: flags for both C and FORTRAN #####
#FLAGS = -O2 -G 8200 -static
FLAGS = -O2 -G 8200
##### FFLAGS: flags for FORTRAN #####
# -u # no implicit declarations
# -w0 # Do not suppress the warning message for unused variables.
# -static  # Cause all local variables to be statically allocated.
FFLAGS = $(FPFA) $(FLAGS) -u
##### CFLAGS: flags for C #####
# -float # don't promote to double
CFLAGS = $(FLAGS)
# COMPILER is the driver plus flags used by the compile and link recipes;
# swap in the commented line to build with the C compiler instead.
COMPILER = $(FC) $(FFLAGS)
##COMPILER = $(CC) $(CFLAGS)
LDFLAGS = -bestGnum -Wl,-L$(TOOLROOT)/usr/lib

# MAIN names the executable; the derived files are named $(MAIN).<suffix>.
MAIN = mat

#OBJ = $(SRC:.f=.o)
OBJ = mat.o

# Ucode counterparts of the objects (used by the commented-out level-3 link).
UCODE = $(OBJ:.o=.u)

############################################################################
## build the benchmark if it is out of date, then run it under timex
# "./" keeps the run independent of whether "." is on PATH.
default:	$(MAIN)
	timex ./$(MAIN)

# default never creates a file of that name, so declare it phony; placed
# after the rule so that default stays the first target in the file.
.PHONY:	default

############################################################################
## cord: rearrange the procedures of an executable for better cache mapping.
## Inputs are the relocatable link and the ordering derived from a pixie run.
$(MAIN).cord:	$(MAIN).reorder $(MAIN).rel
	$(TOOLROOT)/usr/lib/cord -o $@ $(MAIN).rel $(MAIN).reorder

## link an executable that still carries its relocation information
$(MAIN).rel:	$(OBJ)
	timex $(COMPILER) -Wl,-r,-d,-z $(OBJ) $(LDFLAGS) -o $@

## turn the feedback file into the ordering file that cord is given
$(MAIN).reorder:	$(MAIN).fd
	$(TOOLROOT)/usr/lib/ftoc $(MAIN).fd > $@

## have prof write the feedback file once the pixie run has been made
$(MAIN).fd:	$(MAIN) $(MAIN).pixflag
	prof -pixie -feedback $@ $(MAIN)

############################################################################
## generate pixie reports
# NOTE(review): PROFDIR is not assigned in this file; if it is empty the
# report is written to /$(MAIN).pix.out -- confirm it is set by the caller.
$(PROFDIR)/$(MAIN).pix.out:	$(MAIN) $(MAIN).pixflag
	prof -pixie $(PIXFLAGS) $(MAIN) \
		$(MAIN).Addrs $(MAIN).Counts* > $@
##	doprof $(MAIN) pix

## run program to get pixie statistics
# run program.  makes $(MAIN).Counts$pid
## note: MP with pixie and graphics will crash window manager. ##
# Stale counts are removed first.  The stamp file is touched only after the
# instrumented run has succeeded, so a failed run is retried by the next
# make instead of being recorded as done.  "./" avoids depending on "."
# being on PATH.
$(MAIN).pixflag:	$(MAIN).pixie
	set nonomatch ; rm -f $(MAIN).Counts*
	timex ./$(MAIN).pixie
	touch $@

## make an executable with basic block counting
# also makes $(MAIN).Addrs
$(MAIN).pixie:	$(MAIN)
	pixie $(MAIN)

############################################################################
## generate prof report
# NOTE(review): PROFDIR is not assigned in this file; if it is empty the
# report is written to /$(MAIN).prof.out -- confirm it is set by the caller.
$(PROFDIR)/$(MAIN).prof.out:	$(MAIN).profflag
	prof $(MAIN).prof *.$(MAIN).prof > $@
##	doprof $(MAIN) prof

## run program to get prof statistics
# run program.  makes $$.$(MAIN).prof
# Old sample files are removed first.  The stamp file is touched only after
# the profiled run has succeeded, so a failed run is retried by the next
# make instead of being recorded as done.  "./" avoids depending on "."
# being on PATH.
$(MAIN).profflag:	$(MAIN).prof
	set nonomatch ; rm -f *.$(MAIN).prof
	timex ./$(MAIN).prof
	touch $@

## make an executable with PC sampling
$(MAIN).prof:	$(OBJ)
	timex $(COMPILER) $(OBJ) -p $(LDFLAGS) -o $@

############################################################################
## make an executable at optimization level 3
# (disabled alternative: links from the $(UCODE) files instead of $(OBJ);
# only one of the two $(MAIN) rules may be active at a time)
#$(MAIN):	$(UCODE)
#	timex $(COMPILER) $(UCODE) $(LDFLAGS) -o $@

## make an executable at optimization level 0,1,2
# links $(OBJ) with the compile flags plus $(LDFLAGS), run under timex
$(MAIN):	$(OBJ)
	timex $(COMPILER) $(OBJ) $(LDFLAGS) -o $@

############################################################################
## remove the files generated by the rules in this file
# Each rm is preceded by "set nonomatch", presumably so that csh-style shells
# do not abort the command when a glob matches nothing -- confirm which shell
# the local make uses.
# NOTE(review): the reports are removed from the current directory only,
# while their rules write them under $(PROFDIR).
clean:
	set nonomatch ; rm -f $(MAIN)
	set nonomatch ; rm -f $(OBJ)
	set nonomatch ; rm -f $(UCODE)
	set nonomatch ; rm -f $(MAIN).pixie $(MAIN).Addrs $(MAIN).pix.out
	set nonomatch ; rm -f *.$(MAIN).prof
	set nonomatch ; rm -f $(MAIN).Counts*
	set nonomatch ; rm -f $(MAIN).prof $(MAIN).prof.out
	set nonomatch ; rm -f a.out core nohup.out
	set nonomatch ; rm -f $(MAIN).cord $(MAIN).reorder $(MAIN).fd $(MAIN).rel
	set nonomatch ; rm -f $(MAIN).pixflag $(MAIN).profflag
	set nonomatch ; rm -f *.l *.m
	set nonomatch ; rm -f $(MAIN).pix.*
	set nonomatch ; rm -f $(MAIN).prof.*

# clean names an action, not a file: without this a file called "clean" in
# the directory would make the target look up to date.
.PHONY:	clean

# Register the extra suffixes used by the inference rules below.  .f itself
# is not listed here, so it has to be known to make already.
.SUFFIXES:	.m .u .s

## compile one FORTRAN source with the -j flag to satisfy a .u target
# NOTE(review): presumably -j makes the driver leave the ucode file -- confirm
.f.u:
	$(COMPILER) -j $<

## run the pfa preprocessor directly on one source, using $(PFAOPT)
# NOTE(review): presumably -L names the listing file, -F the transformed
# output (the .m target) and -I the input source -- confirm against pfa docs
.f.m:
	$(TOOLROOT)/usr/lib/pfa -L=$*.l -F=$@ -I=$< $(PFAOPT)

## to strip out stuff: grep -v "^#" < file.f

## compile one FORTRAN source with -S to satisfy a .s target
.f.s:
	$(COMPILER) -S $<
--
Ken J. Harris -- kj at sgi.com or {decwrl,pyramid,ucbvax}!sgi!kj



More information about the Comp.sys.sgi mailing list