Hello Experts,
I am trying to write a quick function to calculate RMS values without using any blocking calls (mainly loops).
This is what I have so far. It is not yet optimized for speed, which is where I need your help:
Here is the function:

Code:
/*
 * How to calculate the RMS (values as selected by the #if in rmsCalc):
 * 0: Approx RMS - Find the peak value within the time duration and divide by sqrt(2)
 * 1: True RMS - Square all the inputs within the time duration and add them, then take the mean and finally the square root.
 */
#define MOTOR_RMS_TECHNEQUE 0    //This can be 0 (peak based) or 1 (true RMS)


/*
 * Converts a duration into a number of task ticks: divides `a` by 0.450 and
 * truncates the quotient to uint32_t. rmsCalc() passes a duration in milliseconds.
 * The argument and the whole expansion are parenthesized so the macro can be
 * used safely inside larger expressions; `a` is evaluated exactly once.
 * NOTE(review): with `a` in ms the divisor implies a 0.450 ms (450 us) task
 * period, while the original note said "450ms Task" - confirm the real period.
 */
#define NONBLOCKING_EXECUTION(a) ((uint32_t)__divf32((a), 0.450f))


/// Per-motor working state and results of the incremental RMS calculation
/// performed by rmsCalc(). All current values are in ADC counts.
typedef struct _PhaseRMS_Vars_t_
{
    /// Latest phase U sample as |ADC reading - 2048|, rewritten on every rmsCalc() call
    float32_t phaseU_offset;
    /// Latest phase V sample as |ADC reading - 2048|, rewritten on every rmsCalc() call
    float32_t phaseV_offset;
    /// Latest phase W sample as |ADC reading - 2048|, rewritten on every rmsCalc() call
    float32_t phaseW_offset;
    /// Phase U RMS result, updated at the end of each measurement window
    float32_t phaseU_RMS;
    /// Phase V RMS result, updated at the end of each measurement window
    float32_t phaseV_RMS;
    /// Phase W RMS result, updated at the end of each measurement window
    float32_t phaseW_RMS;
    /// Largest phaseU_offset seen in the current window (MOTOR_RMS_TECHNEQUE == 0)
    float32_t phaseU_Peak;
    /// Largest phaseV_offset seen in the current window (MOTOR_RMS_TECHNEQUE == 0)
    float32_t phaseV_Peak;
    /// Largest phaseW_offset seen in the current window (MOTOR_RMS_TECHNEQUE == 0)
    float32_t phaseW_Peak;
    /// Number of rmsCalc() calls in the current window; reset to 0 when the RMS is updated
    uint32_t rmsCounter;
    /// Sample acquisition time in ms, derived from the motor speed; 0 when the speed is <= 50 rpm
    float32_t time;
    /// Running sum of phaseU_offset squared over the current window (MOTOR_RMS_TECHNEQUE == 1)
    float32_t phaseUSumSquared;
    /// Running sum of phaseV_offset squared over the current window (MOTOR_RMS_TECHNEQUE == 1)
    float32_t phaseVSumSquared;
    /// Running sum of phaseW_offset squared over the current window (MOTOR_RMS_TECHNEQUE == 1)
    float32_t phaseWSumSquared;
} PhaseRMS_Vars_t;
/// RMS state, two entries, defined elsewhere.
/// NOTE(review): presumably index 0 is Motor A and index 1 is Motor B - confirm against the caller.
extern PhaseRMS_Vars_t MotorCurrent[2];


/**
 * Incremental (non-blocking) phase-current RMS update; call once per task tick.
 *
 * Each call takes one sample per phase, folds it into the running window and,
 * once the window (one period derived from the motor speed) is complete,
 * publishes phaseX_RMS and starts a new window. Results are in ADC counts.
 *
 * MOTOR_RMS_TECHNEQUE selects the method:
 *   0 - peak value within the window multiplied by 1/sqrt(2)
 *   1 - true RMS: square root of the mean of the squared samples
 *
 * While the speed is at or below 50 rpm no window exists: the accumulators are
 * cleared and the RMS outputs are held at 0.
 *
 * @param iMotor RMS state and results for the motor (updated in place).
 * @param pMotor Motor data; MotorNum (0 = Motor A, 1 = Motor B) and
 *               speed.SpeedRpm are read. Any other MotorNum leaves iMotor untouched.
 */
void rmsCalc(PhaseRMS_Vars_t *iMotor, MOTOR_Vars_t *pMotor)
{
    //Mid point of bipolar range(0 to 4096). Kept as float so that the
    //subtraction below is done in floating point: an unsigned 16-bit
    //subtraction would wrap for every sample below mid-scale.
    const float32_t offset = 2048.0f;
    //1/sqrt(2), peak to RMS factor of a sine wave
    const float32_t peakToRms = 0.70710678f;


    //Get the data from the ADC and subtract the offset.
    /*
     * TODO Use the ADC PPB values in future.
     * The PPB values already have the offset subtracted from them.
     * So the offset value subtraction can be eliminated.
     */
    if(pMotor->MotorNum == 0)    //Motor A
    {
        iMotor->phaseU_offset = fabsf((float32_t)M1_IFB_U - offset);
        iMotor->phaseV_offset = fabsf((float32_t)M1_IFB_V - offset);
        iMotor->phaseW_offset = fabsf((float32_t)M1_IFB_W - offset);
    }
    else if(pMotor->MotorNum == 1)    //Motor B
    {
        iMotor->phaseU_offset = fabsf((float32_t)M2_IFB_U - offset);
        iMotor->phaseV_offset = fabsf((float32_t)M2_IFB_V - offset);
        iMotor->phaseW_offset = fabsf((float32_t)M2_IFB_W - offset);
    }
    else
    {
        //Unknown motor: there is no fresh sample, do not re-process stale data.
        return;
    }


    //The RMS is only calculated once the motor is running.
    //NOTE(review): a negative (reverse) SpeedRpm is treated as "not running" - confirm SpeedRpm is a magnitude.
    const float32_t speedRpm = (float32_t)pMotor->speed.SpeedRpm;
    if(speedRpm > 50.0f)
    {
        //N = 120*F/P with P = 4  ->  F = N*4/120 Hz
        //Time period = 1/F = 30/N s = 30000/N ms (single divide)
        iMotor->time = __divf32(30000.0f, speedRpm);  //time in ms
    }
    else
    {
        iMotor->time = 0.0f;
    }
    //Convert time in milliseconds to counter ticks
    const uint32_t maxCount = NONBLOCKING_EXECUTION(iMotor->time);


    //No window (motor stopped): clear the state and report zero instead of
    //computing a mean over zero samples.
    if(maxCount == 0U)
    {
        iMotor->rmsCounter = 0;
        iMotor->phaseU_Peak = 0;
        iMotor->phaseV_Peak = 0;
        iMotor->phaseW_Peak = 0;
        iMotor->phaseUSumSquared = 0;
        iMotor->phaseVSumSquared = 0;
        iMotor->phaseWSumSquared = 0;
        iMotor->phaseU_RMS = 0;
        iMotor->phaseV_RMS = 0;
        iMotor->phaseW_RMS = 0;
        return;
    }


    //Increment the counter; it now equals the number of samples in the window,
    //including the one taken in this call.
    iMotor->rmsCounter++;
#if(MOTOR_RMS_TECHNEQUE == 0)
    //Find the peak Value. The current sample is always considered, also on
    //the tick that closes the window.
    if(iMotor->phaseU_offset > iMotor->phaseU_Peak)
    {
        iMotor->phaseU_Peak = iMotor->phaseU_offset;
    }
    if(iMotor->phaseV_offset > iMotor->phaseV_Peak)
    {
        iMotor->phaseV_Peak = iMotor->phaseV_offset;
    }
    if(iMotor->phaseW_offset > iMotor->phaseW_Peak)
    {
        iMotor->phaseW_Peak = iMotor->phaseW_offset;
    }

    if(iMotor->rmsCounter >= maxCount)
    {
        /* Calculate RMS
         * This is still the RMS in terms of ADC count.
         * Still need to convert count to voltage, and finally voltage to actual current(To be done later)
         * */
        iMotor->phaseU_RMS = iMotor->phaseU_Peak * peakToRms;
        iMotor->phaseV_RMS = iMotor->phaseV_Peak * peakToRms;
        iMotor->phaseW_RMS = iMotor->phaseW_Peak * peakToRms;
        iMotor->rmsCounter = 0;
        iMotor->phaseU_Peak = 0;
        iMotor->phaseV_Peak = 0;
        iMotor->phaseW_Peak = 0;
    }
#elif(MOTOR_RMS_TECHNEQUE == 1)
    //pow(iMotor->phaseU_offset,2) uses a lot of stack, so multiply directly.
    //The current sample is always accumulated, also on the tick that closes the window.
    iMotor->phaseUSumSquared += (iMotor->phaseU_offset * iMotor->phaseU_offset);
    iMotor->phaseVSumSquared += (iMotor->phaseV_offset * iMotor->phaseV_offset);
    iMotor->phaseWSumSquared += (iMotor->phaseW_offset * iMotor->phaseW_offset);

    if(iMotor->rmsCounter >= maxCount)
    {
        /* Calculate RMS
         * This is still the RMS in terms of ADC count.
         * Still need to convert count to voltage, and finally voltage to actual current(To be done later)
         * */

        //Divide by the number of samples actually accumulated, so the mean
        //stays correct even if the window length changed with the speed.
        const float32_t sampleCount = (float32_t)iMotor->rmsCounter;

        //The mean of the squared values is calculated and then the square root is found.
        iMotor->phaseU_RMS = __sqrt(__divf32(iMotor->phaseUSumSquared, sampleCount));
        iMotor->phaseV_RMS = __sqrt(__divf32(iMotor->phaseVSumSquared, sampleCount));
        iMotor->phaseW_RMS = __sqrt(__divf32(iMotor->phaseWSumSquared, sampleCount));
        iMotor->phaseUSumSquared = 0;
        iMotor->phaseVSumSquared = 0;
        iMotor->phaseWSumSquared = 0;
        iMotor->rmsCounter = 0;
    }
#endif
}