|
@@ -97,43 +97,65 @@ void calibrate_delay_loop();
|
97
|
97
|
#define DELAY_US(x) DelayCycleFnc((x) * ((F_CPU) / 1000000UL))
|
98
|
98
|
|
99
|
99
|
#elif defined(__AVR__)
|
100
|
|
-
|
101
|
|
- #define nop() __asm__ __volatile__("nop;\n\t":::)
|
102
|
|
-
|
103
|
|
- FORCE_INLINE static void __delay_4cycles(uint8_t cy) {
|
104
|
|
- __asm__ __volatile__(
|
105
|
|
- L("1")
|
106
|
|
- A("dec %[cnt]")
|
107
|
|
- A("nop")
|
108
|
|
- A("brne 1b")
|
109
|
|
- : [cnt] "+r"(cy) // output: +r means input+output
|
110
|
|
- : // input:
|
111
|
|
- : "cc" // clobbers:
|
112
|
|
- );
|
|
100
|
+ FORCE_INLINE static void __delay_up_to_3c(uint8_t cycles) {
|
|
101
|
+ switch (cycles) {
|
|
102
|
+ case 3:
|
|
103
|
+ __asm__ __volatile__(A("RJMP .+0") A("NOP"));
|
|
104
|
+ break;
|
|
105
|
+ case 2:
|
|
106
|
+ __asm__ __volatile__(A("RJMP .+0"));
|
|
107
|
+ break;
|
|
108
|
+ case 1:
|
|
109
|
+ __asm__ __volatile__(A("NOP"));
|
|
110
|
+ break;
|
|
111
|
+ }
|
113
|
112
|
}
|
114
|
113
|
|
115
|
114
|
// Delay in cycles
|
116
|
|
- FORCE_INLINE static void DELAY_CYCLES(uint16_t x) {
|
117
|
|
-
|
118
|
|
- if (__builtin_constant_p(x)) {
|
119
|
|
- #define MAXNOPS 4
|
120
|
|
-
|
121
|
|
- if (x <= (MAXNOPS)) {
|
122
|
|
- switch (x) { case 4: nop(); case 3: nop(); case 2: nop(); case 1: nop(); }
|
|
115
|
+ FORCE_INLINE static void DELAY_CYCLES(uint16_t cycles) {
|
|
116
|
+ if (__builtin_constant_p(cycles)) {
|
|
117
|
+ if (cycles <= 3) {
|
|
118
|
+ __delay_up_to_3c(cycles);
|
|
119
|
+ }
|
|
120
|
+ else if (cycles == 4) {
|
|
121
|
+ __delay_up_to_3c(2);
|
|
122
|
+ __delay_up_to_3c(2);
|
123
|
123
|
}
|
124
|
124
|
else {
|
125
|
|
- const uint32_t rem = (x) % (MAXNOPS);
|
126
|
|
- switch (rem) { case 3: nop(); case 2: nop(); case 1: nop(); }
|
127
|
|
- if ((x = (x) / (MAXNOPS)))
|
128
|
|
- __delay_4cycles(x); // if need more then 4 nop loop is more optimal
|
|
125
|
+ cycles -= 1 + 4; // Compensate for the first LDI (1) and the first round (4)
|
|
126
|
+ __delay_up_to_3c(cycles % 4);
|
|
127
|
+
|
|
128
|
+ cycles /= 4;
|
|
129
|
+ // The following code burns [1 + 4 * (rounds+1)] cycles
|
|
130
|
+ uint16_t dummy;
|
|
131
|
+ __asm__ __volatile__(
|
|
132
|
+ // "manually" load counter from constants, otherwise the compiler may optimize this part away
|
|
133
|
+ A("LDI %A[rounds], %[l]") // 1c
|
|
134
|
+ A("LDI %B[rounds], %[h]") // 1c (compensating the non branching BRCC)
|
|
135
|
+ L("1")
|
|
136
|
+ A("SBIW %[rounds], 1") // 2c
|
|
137
|
+ A("BRCC 1b") // 2c when branching, else 1c (end of loop)
|
|
138
|
+ : // Outputs ...
|
|
139
|
+ [rounds] "=w" (dummy) // Restrict to a wo (=) 16 bit register pair (w)
|
|
140
|
+ : // Inputs ...
|
|
141
|
+ [l] "M" (cycles%256), // Restrict to 0..255 constant (M)
|
|
142
|
+ [h] "M" (cycles/256) // Restrict to 0..255 constant (M)
|
|
143
|
+ :// Clobbers ...
|
|
144
|
+ "cc" // Indicate we are modifying flags like Carry (cc)
|
|
145
|
+ );
|
129
|
146
|
}
|
130
|
|
-
|
131
|
|
- #undef MAXNOPS
|
132
|
147
|
}
|
133
|
|
- else if ((x >>= 2))
|
134
|
|
- __delay_4cycles(x);
|
|
148
|
+ else {
|
|
149
|
+ __asm__ __volatile__(
|
|
150
|
+ L("1")
|
|
151
|
+ A("SBIW %[cycles], 4") // 2c
|
|
152
|
+ A("BRCC 1b") // 2c when branching, else 1c (end of loop)
|
|
153
|
+ : [cycles] "+w" (cycles) // output: Restrict to a rw (+) 16 bit register pair (w)
|
|
154
|
+ : // input: -
|
|
155
|
+ : "cc" // clobbers: We are modifying flags like Carry (cc)
|
|
156
|
+ );
|
|
157
|
+ }
|
135
|
158
|
}
|
136
|
|
- #undef nop
|
137
|
159
|
|
138
|
160
|
// Delay in microseconds: convert at the configured CPU frequency (F_CPU, in Hz).
#define DELAY_US(x) DELAY_CYCLES((x) * ((F_CPU) / 1000000UL))
|