archlab - Katyusha's blog

Archlab#

Part A Y86-64程序#

规则：给定一段C语言代码，需要用 $Y86-64$ 汇编写出与其功能等价的程序

由于 $Y86-64$ 功能有限，你需要将测试的输入也写在代码中

在对应目录下先用 ./yas test.ys 将汇编源文件汇编成 test.yo，再用 ./yis test.yo 在指令集模拟器上运行，得到运行结果（寄存器和内存的变化）

要求对于一个链表进行元素求和，其中C代码如下

1
/* $begin examples */
2
/* linked list element */
3
typedef struct ELE {
4
    long val;
5
    struct ELE *next;
6
} *list_ptr;
7

8
/* sum_list - Sum the elements of a linked list */
9
long sum_list(list_ptr ls)
10
{
11
    long val = 0;
12
    while (ls) {
13
  val += ls->val;
14
  ls = ls->next;
15
    }
16
    return val;
17
}
18

19
/* rsum_list - Recursive version of sum_list */
20
long rsum_list(list_ptr ls)
21
{
22
    if (!ls)
23
  return 0;
24
    else {
25
  long val = ls->val;
26
  long rest = rsum_list(ls->next);
27
  return val + rest;
28
    }
29
}

比较简单，直接实现即可

##################################################################
# sum_list.ys - iteratively sum the values of a linked list.
# Layout: start-up code at address 0, then the sample list, then
# main and sum_list; the stack grows down from 0x200.

# Start-up: point %rsp at the stack, run main, then halt.
.pos 0
irmovq stack, %rsp
call main
halt

# Sample three-element list; each node is { val, next }.
.align 8
ele1:
    .quad 0x00a
    .quad ele2
ele2:
    .quad 0x0b0
    .quad ele3
ele3:
    .quad 0xc00
    .quad 0

# main: call sum_list(ele1); the sum is left in %rax.
main:
    irmovq ele1, %rdi
    call sum_list
    ret

# long sum_list(list_ptr ls)
#   In:      %rdi = ls (0 for an empty list)
#   Out:     %rax = sum of val over all nodes
#   Scratch: %rsi
sum_list:
    xorq %rax, %rax         # val = 0
    andq %rdi, %rdi         # empty list?
    je done
next:
    mrmovq (%rdi), %rsi     # %rsi = ls->val
    mrmovq 8(%rdi), %rdi    # ls = ls->next
    addq %rsi, %rax         # val += %rsi
    andq %rdi, %rdi         # more nodes?
    jne next
done:
    ret

# Stack starts here and grows toward lower addresses.
    .pos 0x200
stack:

注意：.ys 文件的最后一行（这里是 stack:）后面必须再多打一个换行才能过汇编。原因应该是 yas 的词法分析器按行处理输入、以换行符作为一行结束的标志，最后一行如果没有换行符结尾，就不会被当成完整的一行来处理（

##################################################################
# rsum_list.ys - recursively sum the values of a linked list.
# Layout: start-up code at address 0, then the sample list, then
# main and rsum_list; the stack grows down from 0x200.

# Start-up: point %rsp at the stack, run main, then halt.
.pos 0
irmovq stack, %rsp
call main
halt

# Sample three-element list; each node is { val, next }.
.align 8
ele1:
    .quad 0x00a
    .quad ele2
ele2:
    .quad 0x0b0
    .quad ele3
ele3:
    .quad 0xc00
    .quad 0

# main: call rsum_list(ele1); the sum is left in %rax.
main:
    irmovq ele1, %rdi
    call rsum_list
    ret

# long rsum_list(list_ptr ls)
#   In:  %rdi = ls (0 for an empty list)
#   Out: %rax = ls->val + rsum_list(ls->next), or 0 when ls is NULL
#   %rbx is callee-saved: it is pushed on entry to the recursive case
#   and restored before returning.
# The function sets %rax itself, so the caller does not have to
# pre-zero it, and the base case needs no stack traffic at all.
rsum_list:
    andq %rdi, %rdi         # ls == NULL?
    je rsum_base
    pushq %rbx              # val must survive the recursive call
    mrmovq (%rdi), %rbx     # val = ls->val
    mrmovq 8(%rdi), %rdi    # argument for the call: ls->next
    call rsum_list          # %rax = rest
    addq %rbx, %rax         # return val + rest
    popq %rbx               # restore the caller's %rbx
    ret
rsum_base:
    xorq %rax, %rax         # return 0
    ret

# Stack starts here and grows toward lower addresses.
    .pos 0x200
stack:

注意递归结束的时候，需要恢复被调用者保存寄存器的原始值

1
/* copy_block - Copy src to dest and return xor checksum of src */
2
long copy_block(long *src, long *dest, long len)
3
{
4
    long result = 0;
5
    while (len > 0) {
6
  long val = *src++;
7
  *dest++ = val;
8
  result ^= val;
9
  len--;
10
    }
11
    return result;
12
}
13
/* $end examples */

##################################################################
# copy_block.ys - copy a block of words and return the XOR checksum
# of the source words.
# Layout: start-up code at address 0, then the sample blocks, then
# main and copy_block; the stack grows down from 0x200.

# Start-up: point %rsp at the stack, run main, then halt.
.pos 0
irmovq stack, %rsp
call main
halt

.align 8
# Source block
src:
    .quad 0x00a
    .quad 0x0b0
    .quad 0xc00
# Destination block
dest:
    .quad 0x111
    .quad 0x222
    .quad 0x333

# main: call copy_block(src, dest, 3); the checksum is left in %rax.
main:
    irmovq src, %rdi
    irmovq dest, %rsi
    irmovq $3, %rdx
    call copy_block
    ret

# long copy_block(long *src, long *dest, long len)
#   In:      %rdi = src, %rsi = dest, %rdx = len
#   Out:     %rax = XOR of the len words read from src (0 when len <= 0)
#   Scratch: %r8 (word size), %r9 (constant 1), %r10 (current word)
# The function loads its own constants and clears %rax, so it does not
# depend on register values prepared by the caller. The loop runs only
# while len > 0 (signed), matching `while (len > 0)` in the C version;
# a zero or negative len copies nothing.
copy_block:
    irmovq $8, %r8          # bytes per word
    irmovq $1, %r9          # loop decrement
    xorq %rax, %rax         # result = 0
    andq %rdx, %rdx         # set flags from len for the first test
    jmp test
loop:
    mrmovq (%rdi), %r10     # val = *src
    addq %r8, %rdi          # src++
    rmmovq %r10, (%rsi)     # *dest = val
    addq %r8, %rsi          # dest++
    xorq %r10, %rax         # result ^= val
    subq %r9, %rdx          # len--  (sets the flags tested below)
test:
    jg loop                 # continue while len > 0
    ret

# Stack starts here and grows toward lower addresses.
    .pos 0x200
stack:

Part B `iaddq`的实现#

给定你SEQ的实现，要求你补充iaddq指令(即将寄存器加上一个立即数)的实现

按照SEQ的步骤，一步一步判断每个相关信号的值就行

shell中使用以下指令进行测试

1
make  VERSION=full
2
./ssim -t ../y86-code/asumi.yo
3
cd ../y86-code; make testssim

1
#/* $begin seq-all-hcl */
2
####################################################################
3
#  HCL Description of Control for Single Cycle Y86-64 Processor SEQ   #
4
#  Copyright (C) Randal E. Bryant, David R. O'Hallaron, 2010       #
5
####################################################################
6

7
## Your task is to implement the iaddq instruction
8
## The file contains a declaration of the icodes
9
## for iaddq (IIADDQ)
10
## Your job is to add the rest of the logic to make it work
11

12
####################################################################
13
#    C Include's.  Don't alter these                               #
14
####################################################################
15

16
quote '#include <stdio.h>'
17
quote '#include "isa.h"'
18
quote '#include "sim.h"'
19
quote 'int sim_main(int argc, char *argv[]);'
20
quote 'word_t gen_pc(){return 0;}'
21
quote 'int main(int argc, char *argv[])'
22
quote '  {plusmode=0;return sim_main(argc,argv);}'
23

24
####################################################################
25
#    Declarations.  Do not change/remove/delete any of these       #
26
####################################################################
27

28
##### Symbolic representation of Y86-64 Instruction Codes #############
29
wordsig INOP   'I_NOP'
30
wordsig IHALT  'I_HALT'
31
wordsig IRRMOVQ  'I_RRMOVQ'
32
wordsig IIRMOVQ  'I_IRMOVQ'
33
wordsig IRMMOVQ  'I_RMMOVQ'
34
wordsig IMRMOVQ  'I_MRMOVQ'
35
wordsig IOPQ  'I_ALU'
36
wordsig IJXX  'I_JMP'
37
wordsig ICALL  'I_CALL'
38
wordsig IRET  'I_RET'
39
wordsig IPUSHQ  'I_PUSHQ'
40
wordsig IPOPQ  'I_POPQ'
41
# Instruction code for iaddq instruction
42
wordsig IIADDQ  'I_IADDQ'
43

44
##### Symbolic represenations of Y86-64 function codes                  #####
45
wordsig FNONE    'F_NONE'        # Default function code
46

47
##### Symbolic representation of Y86-64 Registers referenced explicitly #####
48
wordsig RRSP     'REG_RSP'      # Stack Pointer
49
wordsig RNONE    'REG_NONE'     # Special value indicating "no register"
50

51
##### ALU Functions referenced explicitly                            #####
52
wordsig ALUADD  'A_ADD'    # ALU should add its arguments
53

54
##### Possible instruction status values                             #####
55
wordsig SAOK  'STAT_AOK'  # Normal execution
56
wordsig SADR  'STAT_ADR'  # Invalid memory address
57
wordsig SINS  'STAT_INS'  # Invalid instruction
58
wordsig SHLT  'STAT_HLT'  # Halt instruction encountered
59

60
##### Signals that can be referenced by control logic ####################
61

62
##### Fetch stage inputs    #####
63
wordsig pc 'pc'        # Program counter
64
##### Fetch stage computations    #####
65
wordsig imem_icode 'imem_icode'    # icode field from instruction memory
66
wordsig imem_ifun  'imem_ifun'     # ifun field from instruction memory
67
wordsig icode    'icode'    # Instruction control code
68
wordsig ifun    'ifun'    # Instruction function
69
wordsig rA    'ra'      # rA field from instruction
70
wordsig rB    'rb'      # rB field from instruction
71
wordsig valC    'valc'    # Constant from instruction
72
wordsig valP    'valp'    # Address of following instruction
73
boolsig imem_error 'imem_error'    # Error signal from instruction memory
74
boolsig instr_valid 'instr_valid'  # Is fetched instruction valid?
75

76
##### Decode stage computations    #####
77
wordsig valA  'vala'      # Value from register A port
78
wordsig valB  'valb'      # Value from register B port
79

80
##### Execute stage computations  #####
81
wordsig valE  'vale'      # Value computed by ALU
82
boolsig Cnd  'cond'      # Branch test
83

84
##### Memory stage computations    #####
85
wordsig valM  'valm'      # Value read from memory
86
boolsig dmem_error 'dmem_error'    # Error signal from data memory
87

88

89
####################################################################
90
#    Control Signal Definitions.                                   #
91
####################################################################
92

93
################ Fetch Stage     ###################################
94

95
# Determine instruction code
96
word icode = [
97
  imem_error: INOP;
98
  1: imem_icode;    # Default: get from instruction memory
99
];
100

101
# Determine instruction function
102
word ifun = [
103
  imem_error: FNONE;
104
  1: imem_ifun;    # Default: get from instruction memory
105
];
106

107
bool instr_valid = icode in
108
  { INOP, IHALT, IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,
109
         IOPQ, IJXX, ICALL, IRET, IPUSHQ, IPOPQ, IIADDQ};
110

111
# Does fetched instruction require a regid byte?
112
bool need_regids =
113
  icode in { IRRMOVQ, IOPQ, IPUSHQ, IPOPQ,
114
         IIRMOVQ, IRMMOVQ, IMRMOVQ , IIADDQ};
115

116
# Does fetched instruction require a constant word?
117
bool need_valC =
118
  icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IJXX, ICALL , IIADDQ};
119

120
################ Decode Stage    ###################################
121

122
## What register should be used as the A source?
123
word srcA = [
124
  icode in { IRRMOVQ, IRMMOVQ, IOPQ, IPUSHQ  } : rA;
125
  icode in { IPOPQ, IRET } : RRSP;
126
  1 : RNONE; # Don't need register
127
];
128

129
## What register should be used as the B source?
130
word srcB = [
131
  icode in { IOPQ, IRMMOVQ, IMRMOVQ, IIADDQ  } : rB;
132
  icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
133
  1 : RNONE;  # Don't need register
134
];
135

136
## What register should be used as the E destination?
137
word dstE = [
138
  icode in { IRRMOVQ } && Cnd : rB;
139
  icode in { IIRMOVQ, IOPQ, IIADDQ} : rB;
140
  icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
141
  1 : RNONE;  # Don't write any register
142
];
143

144
## What register should be used as the M destination?
145
word dstM = [
146
  icode in { IMRMOVQ, IPOPQ } : rA;
147
  1 : RNONE;  # Don't write any register
148
];
149

150
################ Execute Stage   ###################################
151

152
## Select input A to ALU
153
word aluA = [
154
  icode in { IRRMOVQ, IOPQ } : valA;
155
  icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ , IIADDQ} : valC;
156
  icode in { ICALL, IPUSHQ } : -8;
157
  icode in { IRET, IPOPQ } : 8;
158
  # Other instructions don't need ALU
159
];
160

161
## Select input B to ALU
162
word aluB = [
163
  icode in { IRMMOVQ, IMRMOVQ, IOPQ, ICALL,
164
          IPUSHQ, IRET, IPOPQ , IIADDQ} : valB;
165
  icode in { IRRMOVQ, IIRMOVQ } : 0;
166
  # Other instructions don't need ALU
167
];
168

169
## Set the ALU function
170
word alufun = [
171
  icode == IOPQ : ifun;
172
  1 : ALUADD;
173
];
174

175
## Should the condition codes be updated?
176
bool set_cc = icode in { IOPQ , IIADDQ};
177

178
################ Memory Stage    ###################################
179

180
## Set read control signal
181
bool mem_read = icode in { IMRMOVQ, IPOPQ, IRET };
182

183
## Set write control signal
184
bool mem_write = icode in { IRMMOVQ, IPUSHQ, ICALL };
185

186
## Select memory address
187
word mem_addr = [
188
  icode in { IRMMOVQ, IPUSHQ, ICALL, IMRMOVQ } : valE;
189
  icode in { IPOPQ, IRET } : valA;
190
  # Other instructions don't need address
191
];
192

193
## Select memory input data
194
word mem_data = [
195
  # Value from register
196
  icode in { IRMMOVQ, IPUSHQ } : valA;
197
  # Return PC
198
  icode == ICALL : valP;
199
  # Default: Don't write anything
200
];
201

202
## Determine instruction status
203
word Stat = [
204
  imem_error || dmem_error : SADR;
205
  !instr_valid: SINS;
206
  icode == IHALT : SHLT;
207
  1 : SAOK;
208
];
209

210
################ Program Counter Update ############################
211

212
## What address should instruction be fetched at
213

214
word new_pc = [
215
  # Call.  Use instruction constant
216
  icode == ICALL : valC;
217
  # Taken branch.  Use instruction constant
218
  icode == IJXX && Cnd : valC;
219
  # Completion of RET instruction.  Use value from stack
220
  icode == IRET : valM;
221
  # Default: Use incremented PC
222
  1 : valP;
223
];
224
#/* $end seq-all-hcl */

Part C Y86-64程序性能优化#

写在前面#

这个lab给定了你流水线化的控制代码以及一段代码的C和Y86-64实现，要求你优化流水线的控制代码和汇编代码，提高其运行效率

1
make  VERSION=full
2
cd ../ptest; make SIM=../pipe/psim
3
cd ../ptest; make SIM=../pipe/psim TFLAGS=-i

可以对修改后的处理器进行测试

../misc/yas ncopy.ys && ./check-len.pl < ncopy.yo 检查汇编代码是否超过1000Byte的限制

./correctness.pl检查汇编代码的正确性

make drivers && ./benchmark.pl进行本地跑分，计算CPE

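得分细则如下（设平均 CPE 为 $c$，满分 60 分）：当 $c > 10.5$ 时得 $0$ 分；当 $7.5 \le c \le 10.5$ 时得 $20 \cdot (10.5 - c)$ 分；当 $c < 7.5$ 时得满分 $60$ 分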

C代码如下，函数的功能是将src开始的len个元素全部拷贝到dst对应地址中，并返回源数据中正数个数

1
/* $begin ncopy */
2
/*
3
 * ncopy - copy src to dst, returning number of positive ints
4
 * contained in src array.
5
 */
6
word_t ncopy(word_t *src, word_t *dst, word_t len)
7
{
8
    word_t count = 0;
9
    word_t val;
10

11
    while (len > 0) {
12
  val = *src++;
13
  *dst++ = val;
14
  if (val > 0)
15
      count++;
16
  len--;
17
    }
18
    return count;
19
}
20
/* $end ncopy */

Y86-64汇编原始代码

1
##################################################################
2
# %rdi = src, %rsi = dst, %rdx = len
3
# You can modify this portion
4
  # Loop header
5
  xorq %rax,%rax    # count = 0;
6
  andq %rdx,%rdx    # len <= 0?
7
  jle Done    # if so, goto Done:
8

9
Loop:
10
    mrmovq (%rdi), %r10  # read val from src...
11
  rmmovq %r10, (%rsi)  # ...and store it to dst
12
  andq %r10, %r10    # val <= 0?
13
  jle Npos    # if so, goto Npos:
14
  irmovq $1, %r10
15
  addq %r10, %rax    # count++
16
Npos:
17
  irmovq $1, %r10
18
  subq %r10, %rdx    # len--
19
  irmovq $8, %r10
20
  addq %r10, %rdi    # src++
21
  addq %r10, %rsi    # dst++
22
  andq %rdx,%rdx    # len > 0?
23
  jg Loop      # if so, goto Loop:

跑分结果如下：

Average CPE 15.18 Score 0.0/60.0

你都交原始代码了还想得分？(

使用iaddq指令#

我们用Part B中相同的步骤，修改 pipe-full.hcl引入iaddq指令，减少向寄存器反复写入常数的开销

1
##################################################################
2
# You can modify this portion
3
  # Loop header
4
  xorq %rax,%rax    # count = 0;
5
  andq %rdx,%rdx    # len <= 0?
6
  jle Done    # if so, goto Done:
7

8
Loop:
9
    mrmovq (%rdi), %r10  # read val from src...
10
  rmmovq %r10, (%rsi)  # ...and store it to dst
11
  andq %r10, %r10    # val <= 0?
12
  jle Npos    # if so, goto Npos:
13
    iaddq $1, %rax      # count++
14
Npos:
15
    iaddq $-1, %rdx     # len--
16
  irmovq $8, %r10    # NOTE(review): dead leftover from the baseline - %r10 is not read again before Loop reloads it; can be deleted
17
  iaddq $8, %rdi    # src++
18
  iaddq $8, %rsi    # dst++
19
  andq %rdx,%rdx    # len > 0?
20
  jg Loop      # if so, goto Loop:

Average CPE 13.70 Score 0.0/60.0

优化后结果如上，性能略有提升，但仍然是0分

循环展开以及用条件传送替换条件跳转#

考虑对源代码进行 $8 \times 8$ 的循环展开，同时使用不同的寄存器存储拷贝的值，减少循环判断和数据相关性

同时，对于统计正数这一部分，我们可以简单地使用条件传送而非条件跳转，避免分支预测错误带来的巨大性能损失

1
##################################################################
2
  xorq %rax,%rax    # count = 0;
3
  andq %rdx,%rdx    # len <= 0?
4
  jle Done    # if so, goto Done:
5
    irmovq $1, %r8  #store const 1 in %r8
6

7
Judge:
8
    iaddq $-8, %rdx
9
    jl Endloop
10

11
Loop:
12
    mrmovq (%rdi), %rbx
13
    mrmovq 8(%rdi), %rbp
14
    mrmovq 16(%rdi), %r9
15
    mrmovq 24(%rdi), %r10
16
    mrmovq 32(%rdi), %r11
17
    mrmovq 40(%rdi), %r12
18
    mrmovq 48(%rdi), %r13
19
    mrmovq 56(%rdi), %r14
20

21
    irmovq $0, %rcx
22
    andq %rbx, %rbx
23
    cmovg %r8, %rcx
24
    addq %rcx, %rax
25

26
    irmovq $0, %rcx
27
    andq %rbp, %rbp
28
    cmovg %r8, %rcx
29
    addq %rcx, %rax
30

31
    irmovq $0, %rcx
32
    andq %r9, %r9
33
    cmovg %r8, %rcx
34
    addq %rcx, %rax
35

36
    irmovq $0, %rcx
37
    andq %r10, %r10
38
    cmovg %r8, %rcx
39
    addq %rcx, %rax
40

41
    irmovq $0, %rcx
42
    andq %r11, %r11
43
    cmovg %r8, %rcx
44
    addq %rcx, %rax
45

46
    irmovq $0, %rcx
47
    andq %r12, %r12
48
    cmovg %r8, %rcx
49
    addq %rcx, %rax
50

51
    irmovq $0, %rcx
52
    andq %r13, %r13
53
    cmovg %r8, %rcx
54
    addq %rcx, %rax
55

56
    irmovq $0, %rcx
57
    andq %r14, %r14
58
    cmovg %r8, %rcx
59
    addq %rcx, %rax
60

61
    rmmovq %rbx, (%rsi)
62
    rmmovq %rbp, 8(%rsi)
63
    rmmovq %r9, 16(%rsi)
64
    rmmovq %r10, 24(%rsi)
65
    rmmovq %r11, 32(%rsi)
66
    rmmovq %r12, 40(%rsi)
67
    rmmovq %r13, 48(%rsi)
68
    rmmovq %r14, 56(%rsi)
69

70
  iaddq $64, %rdi    # src+=8
71
  iaddq $64, %rsi    # dst+=8
72
  jmp Judge
73

74
Endloop:
75
    iaddq $8, %rdx
76

77
Judge2:
78
    andq %rdx, %rdx
79
    jle Done
80
Loop2:
81
    mrmovq (%rdi), %rbx
82
    irmovq $0, %rcx
83
    andq %rbx, %rbx
84
    cmovg %r8, %rcx
85
    addq %rcx, %rax
86
    rmmovq %rbx, (%rsi)
87
    iaddq $8, %rdi
88
    iaddq $8, %rsi
89
    iaddq $-1, %rdx
90
    jmp Judge2

CPE以及得分如下

Average CPE 9.98 Score 10.5/60.0

我们发现，在拷贝的数量比较少的时候，CPE的值相当大，甚至比最原始的汇编代码性能还要差，同时，在注释掉对于剩下 $len \%8$ 个数的拷贝与统计的时候，CPE下降到了6.79，足以得到满分，说明该程序性能的瓶颈在于对余数的处理

对于余数的处理#

考虑对于最后8个数不采用循环，直接类似循环展开依次拷贝并统计

1
##################################################################
2
Endloop:
3
    iaddq $8, %rdx
4
    jle Done
5

6
    mrmovq (%rdi), %rbx
7
    irmovq $0, %rcx
8
    andq %rbx, %rbx
9
    cmovg %r8, %rcx
10
    addq %rcx, %rax
11
    iaddq $-1, %rdx
12
    rmmovq %rbx, (%rsi)
13
    jle Done
14

15
    mrmovq 8(%rdi), %rbp
16
    irmovq $0, %rcx
17
    andq %rbp, %rbp
18
    cmovg %r8, %rcx
19
    addq %rcx, %rax
20
    iaddq $-1, %rdx
21
    rmmovq %rbp, 8(%rsi)
22
    jle Done
23

24
    mrmovq 16(%rdi), %r9
25
    irmovq $0, %rcx
26
    andq %r9, %r9
27
    cmovg %r8, %rcx
28
    addq %rcx, %rax
29
    iaddq $-1, %rdx
30
    rmmovq %r9, 16(%rsi)
31
    jle Done
32

33
    mrmovq 24(%rdi), %r10
34
    irmovq $0, %rcx
35
    andq %r10, %r10
36
    cmovg %r8, %rcx
37
    addq %rcx, %rax
38
    iaddq $-1, %rdx
39
    rmmovq %r10, 24(%rsi)
40
    jle Done
41

42
    mrmovq 32(%rdi), %r11
43
    irmovq $0, %rcx
44
    andq %r11, %r11
45
    cmovg %r8, %rcx
46
    addq %rcx, %rax
47
    iaddq $-1, %rdx
48
    rmmovq %r11, 32(%rsi)
49
    jle Done
50

51
    mrmovq 40(%rdi), %r12
52
    irmovq $0, %rcx
53
    andq %r12, %r12
54
    cmovg %r8, %rcx
55
    addq %rcx, %rax
56
    iaddq $-1, %rdx
57
    rmmovq %r12, 40(%rsi)
58
    jle Done
59

60
    mrmovq 48(%rdi), %r13
61
    irmovq $0, %rcx
62
    andq %r13, %r13
63
    cmovg %r8, %rcx
64
    addq %rcx, %rax
65
    iaddq $-1, %rdx
66
    rmmovq %r13, 48(%rsi)
67
    jle Done
68

69
    mrmovq 56(%rdi), %r14
70
    irmovq $0, %rcx
71
    andq %r14, %r14
72
    cmovg %r8, %rcx
73
    addq %rcx, %rax
74
    iaddq $-1, %rdx
75
    rmmovq %r14, 56(%rsi)

Average CPE 9.04 Score 29.3/60.0

性能略有提升

再考虑到该处理器的分支预测策略是总是预测跳转（always taken），而在处理最后这几个数时，跳转到 Done 的可能性比继续处理下一个数更小，jle Done 会经常预测错误，导致性能下降，所以我们把条件跳转的目标改为更可能发生的情况，即继续处理下一个数

1
##################################################################
2
Endloop:
3
    iaddq $8, %rdx
4
    jle Done
5

6
    mrmovq (%rdi), %rbx
7
    irmovq $0, %rcx
8
    andq %rbx, %rbx
9
    cmovg %r8, %rcx
10
    addq %rcx, %rax
11
    iaddq $-1, %rdx
12
    rmmovq %rbx, (%rsi)
13
    jg calc2
14
    jmp Done
15

16
calc2:
17
    mrmovq 8(%rdi), %rbp
18
    irmovq $0, %rcx
19
    andq %rbp, %rbp
20
    cmovg %r8, %rcx
21
    addq %rcx, %rax
22
    iaddq $-1, %rdx
23
    rmmovq %rbp, 8(%rsi)
24
    jg calc3
25
    jmp Done
26

27
calc3:
28
    mrmovq 16(%rdi), %r9
29
    irmovq $0, %rcx
30
    andq %r9, %r9
31
    cmovg %r8, %rcx
32
    addq %rcx, %rax
33
    iaddq $-1, %rdx
34
    rmmovq %r9, 16(%rsi)
35
    jg calc4
36
    jmp Done
37

38
calc4:
39
    mrmovq 24(%rdi), %r10
40
    irmovq $0, %rcx
41
    andq %r10, %r10
42
    cmovg %r8, %rcx
43
    addq %rcx, %rax
44
    iaddq $-1, %rdx
45
    rmmovq %r10, 24(%rsi)
46
    jg calc5
47
    jmp Done
48

49
calc5:
50
    mrmovq 32(%rdi), %r11
51
    irmovq $0, %rcx
52
    andq %r11, %r11
53
    cmovg %r8, %rcx
54
    addq %rcx, %rax
55
    iaddq $-1, %rdx
56
    rmmovq %r11, 32(%rsi)
57
    jg calc6
58
    jmp Done
59

60
calc6:
61
    mrmovq 40(%rdi), %r12
62
    irmovq $0, %rcx
63
    andq %r12, %r12
64
    cmovg %r8, %rcx
65
    addq %rcx, %rax
66
    iaddq $-1, %rdx
67
    rmmovq %r12, 40(%rsi)
68
    jg calc7
69
    jmp Done
70

71
calc7:
72
    mrmovq 48(%rdi), %r13
73
    irmovq $0, %rcx
74
    andq %r13, %r13
75
    cmovg %r8, %rcx
76
    addq %rcx, %rax
77
    iaddq $-1, %rdx
78
    rmmovq %r13, 48(%rsi)
79
    jg calc8
80
    jmp Done
81

82
calc8:
83
    mrmovq 56(%rdi), %r14
84
    irmovq $0, %rcx
85
    andq %r14, %r14
86
    cmovg %r8, %rcx
87
    addq %rcx, %rax
88
    iaddq $-1, %rdx
89
    rmmovq %r14, 56(%rsi)

Average CPE 8.92 Score 31.5/60.0

性能也有所提升

一些奇怪的优化#

发现从条件传送改回条件跳转效率反而增加了。。。可能是条件传送要求对 %rcx反复进行清零，传送，加法，相关性过高，操作数量也变多了，反而不如条件跳转（

1
##################################################################
2
    andq %rbx, %rbx
3
    jle Test2
4
    iaddq $1, %rax
5

6
Test2:
7
    andq %rbp, %rbp
8
    jle Test3
9
    iaddq $1, %rax

类似这样修改就行

Average CPE 8.50 Score 40.0/60.0

效率又提升了

然后发现实验驱动在进入ncopy时%rax的值初始为0，且测试数据中不存在负长度的情况，因此为了榨分可以省掉循环之前的初始化与检查（按一般调用约定，健壮写法仍应显式把返回值寄存器清零并处理len <= 0）

Average CPE 8.13 Score 47.4/60.0

我很难受，叫基米来#

现在代码的限制瓶颈是处理余数时由于不知道需要处理的个数，我们只能将数据从内存中加载到寄存器后就立即使用检查其是否大于0，这产生了数据依赖

到了这里已经燃尽了，尝试过将余数按照 $2 \times 2$ 循环展开效率反而下降了，想着提前将余数从内存放进寄存器来减少数据相关，但是会超过编码长度限制，是时候询问伟大的哈基米3.0pro了（

Gemini3.0pro告诉我，可以用类似二叉树的结构高效地处理余数，具体来说，可以先将余数按照0-3和4-7分为左右儿子，对于右儿子，可以直接加载0-3的数进入寄存器中，进而减小了数据依赖，每个节点继续向下分，对于大小为2的右儿子也可以直接加载左儿子寄存器

同时发现循环展开中专门为循环判断设计函数进行跳转是不必要的，可以直接将判断写在循环的末尾

循环结尾改为

1
##################################################################
2
Test9:
3
  iaddq $64, %rdi    # src+=8
4
  iaddq $64, %rsi    # dst+=8
5
    iaddq $-8, %rdx
6
  jge Loop

使用二叉树结构处理余数部分汇编代码如下

1
##################################################################
2
Endloop:
3
    # -8 <= %rdx <= -1
4
    iaddq $4, %rdx
5
    jge Four_to_Seven
6
    iaddq $4, %rdx
7
    jmp Zero_to_Three
8
Four_to_Seven:
9
    mrmovq (%rdi), %rbx
10
    mrmovq 8(%rdi), %rbp
11
    mrmovq 16(%rdi), %r9
12
    mrmovq 24(%rdi), %r10
13

14
    rmmovq %rbx, (%rsi)
15
    rmmovq %rbp, 8(%rsi)
16
    rmmovq %r9, 16(%rsi)
17
    rmmovq %r10, 24(%rsi)
18

19
    andq %rbx, %rbx
20
    jle Notadd1
21
    iaddq $1, %rax
22
Notadd1:
23
    andq %rbp, %rbp
24
    jle Notadd2
25
    iaddq $1, %rax
26
Notadd2:
27
    andq %r9, %r9
28
    jle Notadd3
29
    iaddq $1, %rax
30
Notadd3:
31
    andq %r10, %r10
32
    jle Notadd4
33
    iaddq $1, %rax
34
Notadd4:
35
    iaddq $32, %rdi
36
    iaddq $32, %rsi
37

38
Zero_to_Three:
39
    # 0 <= %rdx <= 3
40
    iaddq $-2, %rdx
41
    jge Two_to_Three
42
    iaddq $2, %rdx
43
    jmp Zero_to_One
44

45
Two_to_Three:
46
    mrmovq (%rdi), %rbx
47
    mrmovq 8(%rdi), %rbp
48
    andq %rbx, %rbx
49
    jle Notadd1_2
50
    iaddq $1, %rax
51
Notadd1_2:
52
    andq %rbp, %rbp
53
    jle Notadd2_2
54
    iaddq $1, %rax
55
Notadd2_2:
56
    rmmovq %rbx, (%rsi)
57
    rmmovq %rbp, 8(%rsi)
58
    iaddq $16, %rdi
59
    iaddq $16, %rsi
60

61
Zero_to_One:
62
    andq %rdx, %rdx
63
    je Done
64
    mrmovq (%rdi), %rbx
65
    rmmovq %rbx, (%rsi)
66
    andq %rbx, %rbx
67
    jle Done
68
    iaddq $1, %rax

Average CPE 7.90 Score 52.1/60.0

尝试在余数为2和3的时候进行特判，避免进入左子树

1
##################################################################
2
Two_to_Three:
3
    mrmovq (%rdi), %rbx
4
    mrmovq 8(%rdi), %rbp
5
    je Handle_2
6
    mrmovq 16(%rdi), %r9
7
    rmmovq %r9, 16(%rsi)
8
    andq %r9, %r9
9
    jle Handle_2
10
    iaddq $1, %rax
11
Handle_2:
12
    rmmovq %rbx, (%rsi)
13
    andq %rbx, %rbx
14
    jle Notadd1_2
15
    iaddq $1, %rax
16
Notadd1_2:
17
    rmmovq %rbp, 8(%rsi)
18
    andq %rbp, %rbp
19
    jle Done
20
    iaddq $1, %rax
21
    jmp Done

Average CPE 7.80 Score 54.0/60.0

同时我们发现在处理的长度特别小的时候，CPE相当大

| 长度 | 周期数 | CPE |
| --- | --- | --- |
| 0 | 26 | — |
| 1 | 33 | 33.00 |
| 2 | 33 | 16.50 |
| 3 | 39 | 13.00 |
| 4 | 46 | 11.50 |
| 5 | 53 | 10.60 |

由于处理器的分支预测策略是总是预测跳转，所以我们尝试把判断改写成条件跳转的目标是左子树（余数较小的一侧），这样在长度很小时就不会因为预测错误而浪费周期

1
##################################################################
2
Endloop:
3
    # -8 <= %rdx <= -1
4
    iaddq $4, %rdx
5
    jl Pre_Zero_to_Three
6

7
Four_to_Seven:
8
    mrmovq (%rdi), %rbx
9
    mrmovq 8(%rdi), %rbp
10
    mrmovq 16(%rdi), %r9
11
    mrmovq 24(%rdi), %r10
12

13
    rmmovq %rbx, (%rsi)
14
    rmmovq %rbp, 8(%rsi)
15
    rmmovq %r9, 16(%rsi)
16
    rmmovq %r10, 24(%rsi)
17

18
    andq %rbx, %rbx
19
    jle Notadd1
20
    iaddq $1, %rax
21
Notadd1:
22
    andq %rbp, %rbp
23
    jle Notadd2
24
    iaddq $1, %rax
25
Notadd2:
26
    andq %r9, %r9
27
    jle Notadd3
28
    iaddq $1, %rax
29
Notadd3:
30
    andq %r10, %r10
31
    jle Notadd4
32
    iaddq $1, %rax
33
Notadd4:
34
    iaddq $32, %rdi
35
    iaddq $32, %rsi
36
    jmp Zero_to_Three
37

38
Pre_Zero_to_Three:
39
    iaddq $4, %rdx
40

41
Zero_to_Three:
42
    # 0 <= %rdx <= 3
43
    iaddq $-2, %rdx
44
    jl Zero_to_One
45

46
Two_to_Three:
47
    mrmovq (%rdi), %rbx
48
    mrmovq 8(%rdi), %rbp
49
    je Handle_2
50
    mrmovq 16(%rdi), %r9
51
    rmmovq %r9, 16(%rsi)
52
    andq %r9, %r9
53
    jle Handle_2
54
    iaddq $1, %rax
55
Handle_2:
56
    rmmovq %rbx, (%rsi)
57
    andq %rbx, %rbx
58
    jle Notadd1_2
59
    iaddq $1, %rax
60
Notadd1_2:
61
    rmmovq %rbp, 8(%rsi)
62
    andq %rbp, %rbp
63
    jle Done
64
    iaddq $1, %rax
65
    jmp Done
66

67
Zero_to_One:
68
    iaddq $2, %rdx
69
    je Done
70
    mrmovq (%rdi), %rbx
71
    rmmovq %rbx, (%rsi)
72
    andq %rbx, %rbx
73
    jle Done
74
    iaddq $1, %rax

Average CPE 7.65 Score 57.1/60.0

调到这里已经产生生理性不适了，再写下去就要堆成屎山了，后面的区域以后再来探索吧（（（

遗憾离场

#/* $begin ncopy-ys */
##################################################################
# ncopy.ys - Copy a src block of len words to dst.
# Return the number of positive words (>0) contained in src.
#
# Include your name and ID here.
#
# Describe how and why you modified the baseline code.
#
# - Uses iaddq for every constant add, so no register has to be
#   loaded with 1 or 8 first.
# - The main loop handles 8 words per iteration. All 8 loads are
#   issued before the stores and the sign tests, so no load is
#   immediately followed by a use of the loaded value.
# - Positive words are counted with a conditional jump around
#   `iaddq $1, %rax`.
# - The loop test sits at the bottom of the loop (iaddq $-8 / jge).
# - The remaining len % 8 words are handled by a small decision tree
#   (4..7 -> copy 4, then 0..3; 2..3; 0..1) instead of a per-word loop.
#
# Assumption: %rax is NOT cleared here. The code relies on the caller
# entering with %rax == 0 (the write-up reports this holds for the lab
# driver). For a general-purpose version add `xorq %rax, %rax` as the
# first instruction.
# A len <= 0 copies nothing.
##################################################################
# Do not modify this portion
# Function prologue.
# %rdi = src, %rsi = dst, %rdx = len
ncopy:

##################################################################
# You can modify this portion
    # %rdx is kept biased by -8 while the main loop runs:
    # %rdx >= 0 means at least 8 words are still left.
    iaddq $-8, %rdx
    jl Endloop

Loop:
    # Load 8 words into 8 different registers ...
    mrmovq (%rdi), %rbx
    mrmovq 8(%rdi), %rbp
    mrmovq 16(%rdi), %r9
    mrmovq 24(%rdi), %r10
    mrmovq 32(%rdi), %r11
    mrmovq 40(%rdi), %r12
    mrmovq 48(%rdi), %r13
    mrmovq 56(%rdi), %r14

    # ... store them ...
    rmmovq %rbx, (%rsi)
    rmmovq %rbp, 8(%rsi)
    rmmovq %r9, 16(%rsi)
    rmmovq %r10, 24(%rsi)
    rmmovq %r11, 32(%rsi)
    rmmovq %r12, 40(%rsi)
    rmmovq %r13, 48(%rsi)
    rmmovq %r14, 56(%rsi)

    # ... and count the positive ones.
    andq %rbx, %rbx
    jle Test2
    iaddq $1, %rax

Test2:
    andq %rbp, %rbp
    jle Test3
    iaddq $1, %rax

Test3:
    andq %r9, %r9
    jle Test4
    iaddq $1, %rax

Test4:
    andq %r10, %r10
    jle Test5
    iaddq $1, %rax

Test5:
    andq %r11, %r11
    jle Test6
    iaddq $1, %rax

Test6:
    andq %r12, %r12
    jle Test7
    iaddq $1, %rax

Test7:
    andq %r13, %r13
    jle Test8
    iaddq $1, %rax

Test8:
    andq %r14, %r14
    jle Test9
    iaddq $1, %rax

Test9:
    iaddq $64, %rdi         # src += 8 words
    iaddq $64, %rsi         # dst += 8 words
    iaddq $-8, %rdx         # 8 more words left?
    jge Loop

# handle remainder
Endloop:
    # For len >= 0: -8 <= %rdx <= -1, i.e. remainder = %rdx + 8.
    iaddq $4, %rdx
    jl Pre_Zero_to_Three    # remainder is 0..3

Four_to_Seven:
    # Remainder is 4..7: copy 4 words; %rdx (0..3) is what is left after them.
    mrmovq (%rdi), %rbx
    mrmovq 8(%rdi), %rbp
    mrmovq 16(%rdi), %r9
    mrmovq 24(%rdi), %r10

    rmmovq %rbx, (%rsi)
    rmmovq %rbp, 8(%rsi)
    rmmovq %r9, 16(%rsi)
    rmmovq %r10, 24(%rsi)

    andq %rbx, %rbx
    jle Notadd1
    iaddq $1, %rax
Notadd1:
    andq %rbp, %rbp
    jle Notadd2
    iaddq $1, %rax
Notadd2:
    andq %r9, %r9
    jle Notadd3
    iaddq $1, %rax
Notadd3:
    andq %r10, %r10
    jle Notadd4
    iaddq $1, %rax
Notadd4:
    iaddq $32, %rdi
    iaddq $32, %rsi
    jmp Zero_to_Three

Pre_Zero_to_Three:
    iaddq $4, %rdx          # undo the -4 bias: %rdx = remainder (0..3)

Zero_to_Three:
    # For len >= 0: 0 <= %rdx <= 3
    iaddq $-2, %rdx
    jl Zero_to_One          # remainder is 0..1

Two_to_Three:
    # %rdx is 0 (two words left) or 1 (three words left).
    mrmovq (%rdi), %rbx
    mrmovq 8(%rdi), %rbp
    je Handle_2             # flags are still those of `iaddq $-2`: mrmovq does not set them
    mrmovq 16(%rdi), %r9    # third word
    rmmovq %r9, 16(%rsi)
    andq %r9, %r9
    jle Handle_2
    iaddq $1, %rax
Handle_2:
    rmmovq %rbx, (%rsi)
    andq %rbx, %rbx
    jle Notadd1_2
    iaddq $1, %rax
Notadd1_2:
    rmmovq %rbp, 8(%rsi)
    andq %rbp, %rbp
    jle Done
    iaddq $1, %rax
    jmp Done

Zero_to_One:
    iaddq $2, %rdx          # undo the -2 bias: %rdx = remainder, or the original len if it was negative
    jle Done                # nothing left (jle, not je, so a negative len also stops here)
    mrmovq (%rdi), %rbx
    rmmovq %rbx, (%rsi)
    andq %rbx, %rbx
    jle Done
    iaddq $1, %rax

##################################################################
# Do not modify the following section of code
# Function epilogue.
Done:
    ret
##################################################################
# Keep the following label at the end of your function
End:
#/* $end ncopy-ys */

Katyusha's blog