Sorting – Completely parallel approach

admin August 14, 2017 FPGA, Intermediate Leave a comment 4,676 Views

The first approach I attempted was a completely parallel approach as shown in the diagram below:

The advantages of this approach, provided it meets timing, are that it produces a result every clock cycle, is compact and easy to understand, and uses a reasonable amount of logic.

compare.sv

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197

// compare: fully parallel four-input sorter.
//
// The four inputs are captured in a register stage, all six pairwise
// unsigned "less than" comparisons are evaluated combinationally on the
// registered values, and the six results select one of the 24 possible
// orderings. The selected ordering is registered onto the outputs, so
// a_out <= b_out <= c_out <= d_out two clock edges after the inputs were
// sampled, with a new result available on every clock.
//
// Parameters:
//   DWIDTH - width in bits of each value being sorted.
module compare
#(
    parameter DWIDTH = 8
)
(
    input clk,

    input [DWIDTH-1:0] a_in,
    input [DWIDTH-1:0] b_in,
    input [DWIDTH-1:0] c_in,
    input [DWIDTH-1:0] d_in,

    output logic [DWIDTH-1:0] a_out,
    output logic [DWIDTH-1:0] b_out,
    output logic [DWIDTH-1:0] c_out,
    output logic [DWIDTH-1:0] d_out
);

    // Registered copies of the inputs; every comparison works on these.
    logic [DWIDTH-1:0] a_reg, b_reg, c_reg, d_reg;

    // One strict "less than" flag per unordered pair of inputs.
    logic a_lt_b, a_lt_c, a_lt_d;
    logic b_lt_c, b_lt_d;
    logic c_lt_d;

    // Stage 1: sample the inputs.
    always_ff @(posedge clk) begin
        {a_reg, b_reg, c_reg, d_reg} <= {a_in, b_in, c_in, d_in};
    end

    // All six comparisons are evaluated in parallel.
    always_comb begin
        a_lt_b = a_reg < b_reg;
        a_lt_c = a_reg < c_reg;
        a_lt_d = a_reg < d_reg;
        b_lt_c = b_reg < c_reg;
        b_lt_d = b_reg < d_reg;
        c_lt_d = c_reg < d_reg;
    end

    // Stage 2: route the registered values to the outputs in ascending order.
    // Only 24 of the 64 flag patterns correspond to a consistent ordering, so
    // the case is intentionally incomplete. Because the comparisons are
    // strict, equal values simply resolve to one of the listed patterns.
    always_ff @(posedge clk) begin /* verilator lint_off CASEINCOMPLETE */
        unique case ({a_lt_b, a_lt_c, a_lt_d, b_lt_c, b_lt_d, c_lt_d})
            // a is the smallest
            6'b111111: {a_out, b_out, c_out, d_out} <= {a_reg, b_reg, c_reg, d_reg};
            6'b111110: {a_out, b_out, c_out, d_out} <= {a_reg, b_reg, d_reg, c_reg};
            6'b111011: {a_out, b_out, c_out, d_out} <= {a_reg, c_reg, b_reg, d_reg};
            6'b111001: {a_out, b_out, c_out, d_out} <= {a_reg, c_reg, d_reg, b_reg};
            6'b111100: {a_out, b_out, c_out, d_out} <= {a_reg, d_reg, b_reg, c_reg};
            6'b111000: {a_out, b_out, c_out, d_out} <= {a_reg, d_reg, c_reg, b_reg};
            // b is the smallest
            6'b011111: {a_out, b_out, c_out, d_out} <= {b_reg, a_reg, c_reg, d_reg};
            6'b011110: {a_out, b_out, c_out, d_out} <= {b_reg, a_reg, d_reg, c_reg};
            6'b001111: {a_out, b_out, c_out, d_out} <= {b_reg, c_reg, a_reg, d_reg};
            6'b000111: {a_out, b_out, c_out, d_out} <= {b_reg, c_reg, d_reg, a_reg};
            6'b010110: {a_out, b_out, c_out, d_out} <= {b_reg, d_reg, a_reg, c_reg};
            6'b000110: {a_out, b_out, c_out, d_out} <= {b_reg, d_reg, c_reg, a_reg};
            // c is the smallest
            6'b101011: {a_out, b_out, c_out, d_out} <= {c_reg, a_reg, b_reg, d_reg};
            6'b101001: {a_out, b_out, c_out, d_out} <= {c_reg, a_reg, d_reg, b_reg};
            6'b001011: {a_out, b_out, c_out, d_out} <= {c_reg, b_reg, a_reg, d_reg};
            6'b000011: {a_out, b_out, c_out, d_out} <= {c_reg, b_reg, d_reg, a_reg};
            6'b100001: {a_out, b_out, c_out, d_out} <= {c_reg, d_reg, a_reg, b_reg};
            6'b000001: {a_out, b_out, c_out, d_out} <= {c_reg, d_reg, b_reg, a_reg};
            // d is the smallest
            6'b110100: {a_out, b_out, c_out, d_out} <= {d_reg, a_reg, b_reg, c_reg};
            6'b110000: {a_out, b_out, c_out, d_out} <= {d_reg, a_reg, c_reg, b_reg};
            6'b010100: {a_out, b_out, c_out, d_out} <= {d_reg, b_reg, a_reg, c_reg};
            6'b000100: {a_out, b_out, c_out, d_out} <= {d_reg, b_reg, c_reg, a_reg};
            6'b100000: {a_out, b_out, c_out, d_out} <= {d_reg, c_reg, a_reg, b_reg};
            6'b000000: {a_out, b_out, c_out, d_out} <= {d_reg, c_reg, b_reg, a_reg};
        endcase
    end

endmodule

The real key to the design is to work out the muxing criteria. This was done using pen and paper in my case.

I’m still working through the SystemC side of things for the testbench. The testbench is relatively simple: I have parameterized the data width so that I can simply loop through all the values on the four inputs. Currently I verify by visual inspection that the outputs are properly sorted; however, I will soon change the testbench to check the sorting automatically. Here is the C++ code. I encourage you to look at the repo for the entire design.

tb.cpp

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35

#include "tb.h"

// Stimulus thread: drives every combination of the values 0..7 onto the four
// DUT inputs, one combination per wait(), then idles for a few more waits
// before ending the simulation.
void tb::source() {
    const int kValueCount = 8;   // each input sweeps 0 .. kValueCount-1
    const int kTrailingWaits = 4;

    for (int va = 0; va < kValueCount; ++va) {
        // Progress marker: printed once per value of the outermost input.
        cout << va << endl;
        for (int vb = 0; vb < kValueCount; ++vb) {
            for (int vc = 0; vc < kValueCount; ++vc) {
                for (int vd = 0; vd < kValueCount; ++vd) {
                    a_in.write(va);
                    b_in.write(vb);
                    c_in.write(vc);
                    d_in.write(vd);
                    wait();
                }
            }
        }
    }

    // Presumably lets the last stimulus propagate through the DUT and be
    // printed by the sink before the simulation is stopped.
    for (int n = 0; n < kTrailingWaits; ++n) {
        wait();
    }
    sc_stop();
}

// Monitor thread: samples the four DUT outputs, waits once, then prints the
// sampled values as one tab-separated line. Runs until the simulation stops.
void tb::sink() {
    for (;;) {
        // Sample first; the values printed below are the ones seen before
        // the wait() that follows.
        const int first = a_out.read();
        const int second = b_out.read();
        const int third = c_out.read();
        const int fourth = d_out.read();

        wait();

        cout << first << "\t" << second << "\t" << third << "\t" << fourth << endl;
    }
}

tb.h

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21

#include
SC_MODULE(tb)
{
sc_in clk;
sc_out a_in;
sc_out b_in;
sc_out c_in;
sc_out d_in;
sc_in a_out;
sc_in b_out;
sc_in c_out;
sc_in d_out;

void source();
void sink();

SC_CTOR(tb) {
SC_CTHREAD(source, clk.pos() );
SC_CTHREAD(sink, clk.pos() );
}
};

Results for a ZCU102 targeting 500 MHz:
clk > 500 MHz
LUT: 232
FF: 258

fmax 645 MHz
LUT: 232
FF: 256

SV Blog SystemVerilog Blog

Sorting – Completely parallel approach

Related Articles

About admin

Check Also

Sorting – Pipelined Even-Odd

Leave a Reply Cancel reply

Machine Learning

Hello World Revisited

Vivado for HiDPI displays under linux

Sorting – Pipelined Even-Odd