function [Out, OutPadded, OutInt] = FFT_VecByCirc_IP(a_data, b_data_pos, p, varargin)
%FFT_VECBYCIRC_IP Vector-by-circulant product computed as an FFT-based cyclic convolution.
%
%   [Out, OutPadded, OutInt] = FFT_VecByCirc_IP(a_data, b_data_pos, p)
%   [Out, OutPadded, OutInt] = FFT_VecByCirc_IP(a_data, b_data_pos, p, extra_bits)
%
%   The two operands are transformed with MyXFFTScaled, multiplied point by
%   point with MyCMul, and transformed back with MyXFFTScaled. When p is not
%   itself a usable transform size, the length-p cyclic convolution is
%   embedded into a longer power-of-two cyclic convolution (see below).
%
%   Inputs:
%     a_data     - first operand. It is concatenated horizontally and must
%                  end up with NSamples elements after padding, so it is
%                  expected to be a 1-by-p row vector.
%     b_data_pos - positions describing the circulant operand. Entries equal
%                  to -1 are discarded. The remaining positions are mapped
%                  to mod(p - pos, p) + 1 and passed to VectorFromPos.
%     p          - size of the circulant block / length of the operands.
%     extra_bits - (optional, default 2) bits added to the 16-bit base width
%                  for the forward FFT input and twiddle widths.
%
%   Outputs:
%     Out       - first p elements of OutPadded.
%     OutPadded - rounded IFFT output reduced modulo 2, over the whole
%                 transform length.
%     OutInt    - first p elements of the rounded (un-reduced) IFFT output.

    InputWidth = 16;
    % Smallest transform the FFT IP handles is 2^3 = 8 samples.
    MinSamplesLog2 = 3;

    if nargin > 3
        extra_bits = varargin{1};
    else
        extra_bits = 2;
    end

    round_mode = 'fix';

    %% Calculate the number of samples that we have to reach
    % Nearest power of two that is >= p, but never below the IP minimum.
    NSamplesLog2 = max(nextpow2(p), MinSamplesLog2);
    NSamples = 2^NSamplesLog2;

    % NOTE: workaround. The position list may carry -1 entries (per the
    % original author, a 0 in the source list that turned into -1 upstream).
    % Left in place, a -1 would be mapped below to a bogus position, so such
    % entries are dropped here.
    b_data_pos(b_data_pos == -1) = [];

    % The caller's matrix is circulant on the rows, while this method (and
    % the literature) works with matrices that are circulant on the columns.
    % Mapping every position pos to mod(p - pos, p) is the index-level
    % equivalent of transposing the matrix. The +1 converts the 0-based
    % result to MATLAB's 1-based indexing.
    b_data_pos_tran = mod(p - b_data_pos, p);
    b_data = VectorFromPos(b_data_pos_tran + 1, p);

    % Padding is needed whenever p is not already a valid transform length.
    hasPad = (NSamples - p) > 0;
    if hasPad
        % Embed the length-p cyclic convolution in a length-M cyclic
        % convolution, with M a power of two:
        %   a_ext = [a(1), 0, ..., 0, a(2:p)]          (M - p zeros inserted)
        %   b_ext = [b, b, ..., b, b(1:leftBits)]      (b repeated cyclically)
        % With 0-based indices, output n (0 <= n <= p-1) reads
        % b_ext(n + p - j) for j = 1..p-1, i.e. offsets up to 2p-2. Those
        % offsets must not wrap around M, hence M >= 2p-1. (A bound of 2p-3
        % is only sufficient for a convolution of length p-1, and gives a
        % wrong term whenever p = 2^k + 1.) The IP minimum of 8 samples is
        % enforced here as well.
        NSamples = 2^max(nextpow2(2*p - 1), MinSamplesLog2);

        % Extend the a_data vector.
        a_input_ext = [a_data(1) zeros(1, NSamples - p) a_data(2:end)];

        % Extend the b_data vector: as many whole copies as fit, then the
        % first leftBits elements of one more copy.
        numDataFrame = floor(NSamples / p);
        leftBits = NSamples - numDataFrame*p;
        b_input_rep = repmat(b_data, 1, numDataFrame);
        b_input_ext = [b_input_rep b_data(1:leftBits)];

        % Sanity checks on the padding. leftBits == 0 is legitimate: it
        % happens when NSamples is an exact multiple of p.
        assert(leftBits >= 0 && leftBits < p);
        assert(length(b_input_ext) == NSamples && length(a_input_ext) == NSamples);
    else
        a_input_ext = a_data;
        b_input_ext = b_data;
    end

    % The padding may have changed the transform length, so recompute its log2.
    NSamplesLog2 = log2(NSamples);

    % Null imaginary part shared by both (real) operands.
    im_null_data = zeros(1, NSamples);

    fft_config.C_NFFT_MAX = NSamplesLog2;
    fft_config.C_ARCH = 3;
    fft_config.C_HAS_NFFT = 0;
    fft_config.C_USE_FLT_PT = 0;
    fft_config.C_INPUT_WIDTH = InputWidth+extra_bits; % Must be 32 if C_USE_FLT_PT = 1
    fft_config.C_TWIDDLE_WIDTH = InputWidth+extra_bits; % Must be 24 or 25 if C_USE_FLT_PT = 1
    fft_config.C_HAS_BFP = 0; % Set to 0 if C_USE_FLT_PT = 1
    fft_config.C_HAS_SCALING = 1; % Set to 0 if C_USE_FLT_PT = 1
    fft_config.C_HAS_ROUNDING = 1; % Convergent rounding.

%% FFT
    a_fft = MyXFFTScaled(fft_config, round_mode, a_input_ext, im_null_data, 1);
    b_fft = MyXFFTScaled(fft_config, round_mode, b_input_ext, im_null_data, 1);

%% Do the complex multiplication
    cNumOfBits = 2*InputWidth+1;
    CMulOut = MyCMul(a_fft, b_fft, cNumOfBits, round_mode);

    % The inverse transform reuses the forward configuration, widened to the
    % bit width of the complex-multiplier output.
    assert(cNumOfBits < 35);
    ifft_config = fft_config;
    ifft_config.C_INPUT_WIDTH = cNumOfBits;
    ifft_config.C_TWIDDLE_WIDTH = cNumOfBits;

%% Do the IFFT
    IFFT_Out = MyXFFTScaled(ifft_config, round_mode, real(CMulOut), imag(CMulOut), 0);

%% Now convert the IFFT output into a sequence of bits
    % Multiply by NSamples and round to the nearest integer, then reduce
    % modulo 2 to obtain bits.
    OutIntPadded = round(real(IFFT_Out*NSamples));
    OutPadded = mod(OutIntPadded, 2);

    % In both the padded and the unpadded case the first p elements are the
    % elements of the result vector.
    Out = OutPadded(1:p);
    OutInt = OutIntPadded(1:p);
end

