109_vectors.zig 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. // So far in Ziglings, we've seen how for loops can be used to
  2. // repeat calculations across an array in several ways.
  3. //
  4. // For loops are generally great for this kind of task, but
  5. // sometimes they don't fully utilize the capabilities of the
  6. // CPU.
  7. //
  8. // Most modern CPUs can execute instructions in which SEVERAL
  9. // calculations are performed WITHIN registers at the SAME TIME.
  10. // These are known as "single instruction, multiple data" (SIMD)
  11. // instructions. SIMD instructions can make code significantly
  12. // more performant.
  13. //
  14. // To see why, imagine we have a program in which we take the
  15. // square root of four (changing) f32 floats.
  16. //
  17. // A simple compiler would take the program and produce machine code
  18. // which calculates each square root sequentially. Most registers on
  19. // modern CPUs have 64 bits, so we could imagine that each float moves
  20. // into a 64-bit register, and the following happens four times:
  21. //
  22. //            32 bits   32 bits
  23. //          +-------------------+
  24. // register |    0    |    x    |
  25. //          +-------------------+
  26. //
  27. //                    |
  28. //           [SQRT instruction]
  29. //                    V
  30. //
  31. //          +-------------------+
  32. //          |    0    | sqrt(x) |
  33. //          +-------------------+
  34. //
  35. // Notice that half of the register contains blank data to which
  36. // nothing happened. What a waste! What if we were able to use
  37. // that space instead? This is the idea at the core of SIMD.
  38. //
  39. // Most modern CPUs contain specialized registers with at least 128 bits
  40. // for performing SIMD instructions. On a machine with 128-bit SIMD
  41. // registers, a smart compiler would probably NOT issue four sqrt
  42. // instructions as above, but instead pack the floats into a single
  43. // 128-bit register, then execute a single "packed" sqrt
  44. // instruction to do ALL the square root calculations at once.
  45. //
  46. // For example:
  47. //
  48. //
  49. //            32 bits   32 bits   32 bits   32 bits
  50. //          +---------------------------------------+
  51. // register |   4.0   |   9.0   |  25.0   |  49.0   |
  52. //          +---------------------------------------+
  53. //
  54. //                              |
  55. //                   [SIMD SQRT instruction]
  56. //                              V
  57. //
  58. //          +---------------------------------------+
  59. // register |   2.0   |   3.0   |   5.0   |   7.0   |
  60. //          +---------------------------------------+
  61. //
  62. // Pretty cool, right?
  63. //
  64. // Code with SIMD instructions is usually more performant than code
  65. // without SIMD instructions. Zig cares a lot about performance,
  66. // so it has built-in support for SIMD! It has a data structure that
  67. // directly supports SIMD instructions:
  68. //
  69. // +-----------+
  70. // | Vectors |
  71. // +-----------+
  72. //
  73. // Operations performed on vectors in Zig will be done in parallel using
  74. // SIMD instructions, whenever possible.
  75. //
  76. // Defining vectors in Zig is straightforward. No library import is needed.
  const v1: @Vector(3, i32) = .{ 1, 10, 100 };
  const v2: @Vector(3, f32) = .{ 2.0, 3.0, 5.0 };

  // Arithmetic operators work on vectors element by element, exactly as
  // they do on the scalar element type.
  const v3 = v1 + v1; // { 2, 20, 200 }
  const v4 = v2 * v2; // { 4.0, 9.0, 25.0 }

  // Builtins defined for the element type generally accept vectors too.
  const v5 = @as(@Vector(3, f32), @floatFromInt(v3)); // { 2.0, 20.0, 200.0 }
  const v6 = v4 - v5; // { 2.0, -11.0, -175.0 }
  const v7 = @abs(v6); // { 2.0, 11.0, 175.0 }

  // @splat fills every element with one value; @reduce folds all elements
  // into a single scalar using the given operation.
  const v8 = @as(@Vector(4, u8), @splat(2)); // { 2, 2, 2, 2 }
  const v8_sum = @reduce(.Add, v8); // 8
  const v8_min = @reduce(.Min, v8); // 2

  // A fixed-length array coerces to a vector of the same length and
  // element type (and the other way round).
  const single_digit_primes: [4]i8 = .{ 2, 3, 5, 7 };
  const prime_vector: @Vector(4, i8) = single_digit_primes;
  93. // Now let's use vectors to simplify and optimize some code!
  94. //
  95. // Ewa is writing a program in which they frequently want to compare
  96. // two lists of four f32s. Ewa expects the lists to be similar, and
  97. // wants to determine the largest pairwise difference between the lists.
  98. //
  99. // Ewa wrote the following function to figure this out.
  /// Returns the largest absolute difference between elements at the same
  /// index of `list1` and `list2`, visiting one pair at a time.
  ///
  /// The running maximum starts at 0 and is only ever replaced by a larger
  /// value, so identical lists yield 0.
  fn calcMaxPairwiseDiffOld(list1: [4]f32, list2: [4]f32) f32 {
      var largest: f32 = 0;
      for (0..list1.len) |i| {
          // Keep whichever is bigger: the maximum so far or this pair's gap.
          largest = @max(largest, @abs(list1[i] - list2[i]));
      }
      return largest;
  }
  110. // Ewa heard about vectors in Zig, and started writing a new vector
  111. // version of the function, but has got stuck!
  112. //
  113. // Help Ewa finish the vector version! The examples above should help.
  /// Four packed `f32` values, processed together by vector operations.
  const Vec4 = @Vector(4, f32);

  /// Returns the largest absolute element-wise difference between `a` and `b`.
  ///
  /// Vector counterpart of `calcMaxPairwiseDiffOld`: the subtraction and the
  /// absolute value are applied to all four elements in one vector
  /// expression, and a single reduction picks the maximum.
  fn calcMaxPairwiseDiffNew(a: Vec4, b: Vec4) f32 {
      // Element-wise |a - b|; `-` and `@abs` both operate on whole vectors.
      const abs_diff_vec = @abs(a - b);
      // Fold the four differences down to the single largest one.
      const max_diff = @reduce(.Max, abs_diff_vec);
      return max_diff;
  }
  120. // Quite the simplification! We could even write the function in one line
  121. // and it would still be readable.
  122. //
  123. // Since the entire function is now expressed in terms of vector operations,
  124. // the Zig compiler will easily be able to compile it down to machine code
  125. // which utilizes the all-powerful SIMD instructions and does a lot of the
  126. // computation in parallel.
  127. const std = @import("std");
  128. const print = std.debug.print;
  /// Runs both max-pairwise-difference implementations on the same pair of
  /// sample lists and prints each result so they can be compared.
  pub fn main() void {
      const first: [4]f32 = .{ 3.141, 2.718, 0.577, 1.000 };
      const second: [4]f32 = .{ 3.154, 2.707, 0.591, 0.993 };

      const old_result = calcMaxPairwiseDiffOld(first, second);
      // The arrays coerce to Vec4 when passed to the vector version.
      const new_result = calcMaxPairwiseDiffNew(first, second);

      print("Max difference (old fn): {d: >5.3}\n", .{old_result});
      print("Max difference (new fn): {d: >5.3}\n", .{new_result});
  }