#ifdef __powerpc__
    /* LADIES AND GENTLEMEN, FOR THE FIRST TIME IN HISTORY, I WROTE AN ASM
       INNER LOOP THAT BEAT GCC'S GENERATED CODE ON INSTRUCTION SCHEDULING! */
    register float tempa, tempb, tempc;
    size rem4 = rem / 4;        /* whole groups of four filter taps */
    size remx = rem % 4;        /* leftover taps, handled below by computed goto */
    while (rem4 > 0) {
        --rem4;

        /* tap 3: m = ym * lf[3], accumulate srcp[9..11] */
        __asm__ volatile("lfs %0,12(%1)" : "=fr"(m) : "r"(lf));
        __asm__ volatile("fmuls %0,%1,%2" : "=fr"(m) : "fr"(ym), "fr"(m));
        __asm__ volatile("lfs %0,36(%1)" : "=fr"(tempa) : "r"(srcp));
        __asm__ volatile("lfs %0,40(%1)" : "=fr"(tempb) : "r"(srcp));
        __asm__ volatile("lfs %0,44(%1)" : "=fr"(tempc) : "r"(srcp));
        __asm__ volatile("fmadds %0,%1,%2,%3" : "=fr"(tr) : "fr"(tempa), "fr"(m), "fr"(tr));
        __asm__ volatile("fmadds %0,%1,%2,%3" : "=fr"(tg) : "fr"(tempb), "fr"(m), "fr"(tg));
        __asm__ volatile("fmadds %0,%1,%2,%3" : "=fr"(tb) : "fr"(tempc), "fr"(m), "fr"(tb));
        __asm__ volatile("fadds %0,%1,%2" : "=fr"(tm) : "fr"(tm), "fr"(m));

        /* tap 2: m = ym * lf[2], accumulate srcp[6..8] */
        __asm__ volatile("lfs %0,8(%1)" : "=fr"(m) : "r"(lf));
        __asm__ volatile("fmuls %0,%1,%2" : "=fr"(m) : "fr"(ym), "fr"(m));
        __asm__ volatile("lfs %0,24(%1)" : "=fr"(tempa) : "r"(srcp));
        __asm__ volatile("lfs %0,28(%1)" : "=fr"(tempb) : "r"(srcp));
        __asm__ volatile("lfs %0,32(%1)" : "=fr"(tempc) : "r"(srcp));
        __asm__ volatile("fmadds %0,%1,%2,%3" : "=fr"(tr) : "fr"(tempa), "fr"(m), "fr"(tr));
        __asm__ volatile("fmadds %0,%1,%2,%3" : "=fr"(tg) : "fr"(tempb), "fr"(m), "fr"(tg));
        __asm__ volatile("fmadds %0,%1,%2,%3" : "=fr"(tb) : "fr"(tempc), "fr"(m), "fr"(tb));
        __asm__ volatile("fadds %0,%1,%2" : "=fr"(tm) : "fr"(tm), "fr"(m));

        /* tap 1: m = ym * lf[1], accumulate srcp[3..5] */
        __asm__ volatile("lfs %0,4(%1)" : "=fr"(m) : "r"(lf));
        __asm__ volatile("fmuls %0,%1,%2" : "=fr"(m) : "fr"(ym), "fr"(m));
        __asm__ volatile("lfs %0,12(%1)" : "=fr"(tempa) : "r"(srcp));
        __asm__ volatile("lfs %0,16(%1)" : "=fr"(tempb) : "r"(srcp));
        __asm__ volatile("lfs %0,20(%1)" : "=fr"(tempc) : "r"(srcp));
        __asm__ volatile("fmadds %0,%1,%2,%3" : "=fr"(tr) : "fr"(tempa), "fr"(m), "fr"(tr));
        __asm__ volatile("fmadds %0,%1,%2,%3" : "=fr"(tg) : "fr"(tempb), "fr"(m), "fr"(tg));
        __asm__ volatile("fmadds %0,%1,%2,%3" : "=fr"(tb) : "fr"(tempc), "fr"(m), "fr"(tb));
        __asm__ volatile("fadds %0,%1,%2" : "=fr"(tm) : "fr"(tm), "fr"(m));

_one:
        /* tap 0: m = ym * lf[0], accumulate srcp[0..2] */
        __asm__ volatile("lfs %0,0(%1)" : "=fr"(m) : "r"(lf));
        __asm__ volatile("fmuls %0,%1,%2" : "=fr"(m) : "fr"(ym), "fr"(m));
        __asm__ volatile("lfs %0,0(%1)" : "=fr"(tempa) : "r"(srcp));
        __asm__ volatile("lfs %0,4(%1)" : "=fr"(tempb) : "r"(srcp));
        __asm__ volatile("lfs %0,8(%1)" : "=fr"(tempc) : "r"(srcp));
        __asm__ volatile("fmadds %0,%1,%2,%3" : "=fr"(tr) : "fr"(tempa), "fr"(m), "fr"(tr));
        __asm__ volatile("fmadds %0,%1,%2,%3" : "=fr"(tg) : "fr"(tempb), "fr"(m), "fr"(tg));
        __asm__ volatile("fmadds %0,%1,%2,%3" : "=fr"(tb) : "fr"(tempc), "fr"(m), "fr"(tb));
        __asm__ volatile("fadds %0,%1,%2" : "=fr"(tm) : "fr"(tm), "fr"(m));

_lim:
        /* advance to the next group: 4 taps (16 bytes), 4 RGB pixels (48 bytes) */
        __asm__ volatile("addi %0,%1,16" : "=r"(lf) : "r"(lf));
        __asm__ volatile("addi %0,%1,48" : "=r"(srcp) : "r"(srcp));
    }
    if (remx) {
        size nuremx = remx;
        remx = 0;   /* cleared so this block is skipped once the jump falls back out */
        /* Jump nuremx tap blocks back from _lim into the loop body.  Every
           tap block is the same fixed-size instruction sequence, so this
           lands on a tap boundary, processes exactly the leftover taps, and
           falls out through _lim when the loop condition fails. */
        goto *(void *)((char *)&&_lim - ((char *)&&_lim - (char *)&&_one) * nuremx);
    }
#else
    UNROLL(rem,
           m = ym * *lf++;
           tr += *srcp++ * m;
           tg += *srcp++ * m;
           tb += *srcp++ * m;
           tm += m;);
#endif
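/*
 * For reference, the trick above works because each unrolled tap assembles
 * to the same fixed-size instruction sequence, so the computed goto can jump
 * a tap count's worth of code back from _lim and fall through exactly the
 * leftover taps.  Plain C cannot rely on statement sizes, so the portable
 * analogue of "enter the unrolled loop mid-body" is the classic switch-based
 * Duff's device, sketched below.  The sketch is illustrative only and kept
 * compiled out (this fragment sits inside a function body); the name
 * weight_rgb_taps and its signature are hypothetical, not part of this file.
 */
#if 0
static float weight_rgb_taps(const float *srcp, const float *lf, size rem,
                             float ym, float *tr, float *tg, float *tb)
{
    float tm = 0.0f, m;
    size n4 = (rem + 3) / 4;            /* do-while iterations, rounded up */

    if (rem == 0)
        return tm;
    switch (rem % 4) {                  /* enter mid-loop for leftover taps */
    case 0: do { m = ym * *lf++;
                 *tr += *srcp++ * m; *tg += *srcp++ * m;
                 *tb += *srcp++ * m; tm += m;
    case 3:      m = ym * *lf++;
                 *tr += *srcp++ * m; *tg += *srcp++ * m;
                 *tb += *srcp++ * m; tm += m;
    case 2:      m = ym * *lf++;
                 *tr += *srcp++ * m; *tg += *srcp++ * m;
                 *tb += *srcp++ * m; tm += m;
    case 1:      m = ym * *lf++;
                 *tr += *srcp++ * m; *tg += *srcp++ * m;
                 *tb += *srcp++ * m; tm += m;
            } while (--n4 > 0);
    }
    return tm;
}
#endif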