[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [f-cpu] registers
On Sun, Oct 06, 2002 at 05:47:32PM -0700, Mohamed Ali Kilani wrote:
[...]
> When you normalize operands, in which range do you keep the divisor?
> [-1/2,1/2]?
Since I deal with integers of different sizes, the answer is a little bit
more complex. First, I extend the operand by 1 bit (either 0 or a copy
of the sign bit, depending on signed/unsigned mode). Then, I shift the
operand left as much as possible without generating an overflow. That
is, an unsigned divisor will be in the range [2^(n-1);2^n[, where <n>
is the chunk size in bits.
> How about "Quotient Selection" mecanism?
> Just compare PR to 0 and -1/2?
I compare PR with 0, and if it's != 0, I compare the signs of the PR
and the divisor.
> How many bits do you use from the redundant representation to decide?
Four digits.
> Do you plan to add, let's say, the 4 MSB of the sum and carry
> representations of the PR and then compare it to -1/2 and 0? or use a
> 256x2 table/PLA to generate quotient digit?
I use an encoding that makes comparision with 0 rather cheap. The sign
bit is calculated by a small adder.
> The PLA solution looks more efficient timing wise vs "small adder + one
> gate stage for decoding". Playing some tricks with the encoding would
> reduce its size and so keep its "access" time reasonable in order to
> acheive: Quotient Selection + D/Not(D) Mux Slection + CSA stages for PR
> computation + miscellaneous logic < 6 gates.
I'm already in the 6 gate range. In fact, the decision logic (quotient
selection) isn't a problem at all - it's the SIMD capability that is
most timing critical.
> I am just curious how do you manage to keep the critical path for an
> iteration under 6G/10T. Honnestly, I am having trouble doing it. :)
I use several tricks:
- optimized encoding of the PR
- precomputed PR(n+1) for all cases
- I moved the pipeline register towards the middle of the SRT core
I'll attach a copy of my latest working version of the core.
--
Michael "Tired" Riepe <Michael.Riepe@stud.uni-hannover.de>
"All I wanna do is have a little fun before I die"
-- srt64.vhdl - SIMD SRT Divider Core
-- Copyright (C) 2001, 2002 Michael Riepe <michael@stud.uni-hannover.de>
--
-- This program is free software; you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation; either version 2 of the License, or
-- (at your option) any later version.
--
-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with this program; if not, write to the Free Software
-- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-- $Id$
library IEEE;
use IEEE.std_logic_1164.all;
use work.Bit_Manipulation.all;
package SRT64 is
-- 64-bit SIMD SRT
procedure SRT64 (U, V, B : in std_ulogic_vector(79 downto 0);
A : in std_ulogic_vector(63 downto 0);
Size : in std_ulogic_vector(2 downto 0);
U2, V2 : out std_ulogic_vector(79 downto 0);
A2 : out std_ulogic_vector(63 downto 0);
QP, QN : out std_ulogic_vector(7 downto 0));
end SRT64;
package body SRT64 is
-- decision logic
procedure decide (U, V, B : in std_ulogic_vector(9 downto 0);
P, D : out std_ulogic) is
variable C : std_ulogic;
begin
-- p = '1' when remainder is zero (approximately)
-- d=1
P := U(9) and U(8) and U(7) and U(6);
-- carry for most significant bit
-- d=2
C := V(8) or (U(8) and V(7)) or (U(8) and U(7) and V(6));
-- D = '1' when remainder/divisor signs differ (=> add)
-- d=3
D := (B(9) xor U(9)) xor C;
end decide;
-- SRT core
procedure slice (U, V, B : in std_ulogic_vector;
A : in std_ulogic_vector;
Ax : in std_ulogic;
P, D : in std_ulogic;
Ui, Vi, Xi0, Xi1 : in std_ulogic;
Chain : in std_ulogic;
U2, V2 : out std_ulogic_vector;
Uo, Vo, Xo0, Xo1 : out std_ulogic) is
constant L : natural := A'length;
alias aa : std_ulogic_vector(L-1 downto 0) is A;
variable uu, vv : std_ulogic_vector(L+1 downto 0);
variable r0, r1 : std_ulogic_vector(L+1 downto 0);
begin
-- pragma synthesis_off
assert U'length = L + 2;
assert V'length = L + 2;
assert B'length = L + 2;
assert U2'length = L + 2;
assert V2'length = L + 2;
-- pragma synthesis_on
-- left shift
-- d=0
Uo := U(L-1);
Vo := V(L-1);
uu := lshift(U, 1);
vv := lshift(V, 1);
-- d=2
if to_X01(Chain) = '1' then
uu(0) := Ui;
vv(0) := Vi;
else
uu(0) := not Ax; -- Ax xor '1'
vv(0) := Ax; -- Ax and '1'
end if;
-- CSA chaining outputs
-- d=3 (worst case)
Xo1 := vv(L-1) or (uu(L-1) and B(L-1));
Xo0 := vv(L-1) or (uu(L-1) and not B(L-1));
-- CSA core
-- d=6 (worst case)
if to_X01(P) = '1' then
-- remainder is zero => do nothing
null;
elsif to_X01(D) = '1' then
-- add B
r0 := uu xor B;
r1 := vv or (uu and B);
r1 := lshift(r1, 1, Xi1 and Chain);
uu := r0 xor r1;
vv := r0 and r1;
else
-- subtract B
r0 := uu xor not B;
r1 := vv or (uu and not B);
r1 := lshift(r1, 1, Xi0 or not Chain);
uu := r0 xor r1;
vv := r0 and r1;
end if;
-- outputs
-- d=6
U2 := uu;
V2 := vv;
end slice;
procedure SRT64 (U, V, B : in std_ulogic_vector(79 downto 0);
A : in std_ulogic_vector(63 downto 0);
Size : in std_ulogic_vector(2 downto 0);
U2, V2 : out std_ulogic_vector(79 downto 0);
A2 : out std_ulogic_vector(63 downto 0);
QP, QN : out std_ulogic_vector(7 downto 0)) is
variable ax, p, d, chain : std_ulogic_vector(7 downto 0);
variable cu, cv, ca, cx0, cx1 : std_ulogic_vector(8 downto 0);
variable aa : std_ulogic_vector(63 downto 0);
begin
-- decision logic for all chunks
for i in 0 to 7 loop
decide(U(10*i+9 downto 10*i),
V(10*i+9 downto 10*i),
B(10*i+9 downto 10*i),
p(i), d(i));
end loop;
-- SIMD result selector
if to_X01(Size(2)) = '1' then
d := (7 downto 0 => d(7));
p := (7 downto 0 => p(7));
elsif to_X01(Size(1)) = '1' then
d := (7 downto 4 => d(7), 3 downto 0 => d(3));
p := (7 downto 4 => p(7), 3 downto 0 => p(3));
elsif to_X01(Size(0)) = '1' then
d(6) := d(7); p(6) := p(7);
d(4) := d(5); p(4) := p(5);
d(2) := d(3); p(2) := p(3);
d(0) := d(1); p(0) := p(1);
end if;
-- result output
QP := d or p;
QN := d and not p;
-- chaining control vector
chain := (
7 => Size(0),
6 => Size(1),
5 => Size(0),
4 => Size(2),
3 => Size(0),
2 => Size(1),
1 => Size(0),
0 => '0'
);
-- SRT slices
cu(0) := '0';
cv(0) := '0';
ca(0) := '0';
cx0(0) := '0';
cx1(0) := '0';
for i in 0 to 7 loop -- range MUST be ascending!
aa(8*i+7 downto 8*i) :=
lshift(A(8*i+7 downto 8*i), 1, ca(i) and chain(i));
ca(i+1) := A(8*i+7);
end loop;
if to_X01(Size(2)) = '1' then
ax := (7 downto 0 => ca(8));
elsif to_X01(Size(1)) = '1' then
ax := (
7 downto 4 => ca(8),
3 downto 0 => ca(4)
);
elsif to_X01(Size(0)) = '1' then
ax := (
7 downto 6 => ca(8),
5 downto 4 => ca(6),
3 downto 2 => ca(4),
1 downto 0 => ca(2)
);
else
ax := ca(8 downto 1);
end if;
for i in 0 to 7 loop -- range MUST be ascending!
slice(
U(10*i+9 downto 10*i),
V(10*i+9 downto 10*i),
B(10*i+9 downto 10*i),
aa(8*i+7 downto 8*i),
ax(i),
p(i), d(i),
cu(i), cv(i), cx0(i), cx1(i),
chain(i),
U2(10*i+9 downto 10*i),
V2(10*i+9 downto 10*i),
cu(i+1), cv(i+1), cx0(i+1), cx1(i+1)
);
end loop;
A2 := aa;
end SRT64;
end SRT64;
-- vi: set ts=4 sw=4 equalprg="fmt -72 -p--": please