rustc_span/
analyze_source_file.rs

1use super::*;
2
3#[cfg(test)]
4mod tests;
5
6/// Finds all newlines, multi-byte characters, and non-narrow characters in a
7/// SourceFile.
8///
9/// This function will use an SSE2 enhanced implementation if hardware support
10/// is detected at runtime.
11pub(crate) fn analyze_source_file(src: &str) -> (Vec<RelativeBytePos>, Vec<MultiByteChar>) {
12    let mut lines = vec![RelativeBytePos::from_u32(0)];
13    let mut multi_byte_chars = vec![];
14
15    // Calls the right implementation, depending on hardware support available.
16    analyze_source_file_dispatch(src, &mut lines, &mut multi_byte_chars);
17
18    // The code above optimistically registers a new line *after* each \n
19    // it encounters. If that point is already outside the source_file, remove
20    // it again.
21    if let Some(&last_line_start) = lines.last() {
22        let source_file_end = RelativeBytePos::from_usize(src.len());
23        assert!(source_file_end >= last_line_start);
24        if last_line_start == source_file_end {
25            lines.pop();
26        }
27    }
28
29    (lines, multi_byte_chars)
30}
31
// cfg(bootstrap)
// Forwards its input unchanged to `cfg_match!` when the `bootstrap` cfg is set
// and to `cfg_select!` otherwise, so a single set of arms serves both cases.
macro_rules! cfg_select_dispatch {
    ($($arms:tt)*) => {
        #[cfg(bootstrap)]
        cfg_match! { $($arms)* }

        #[cfg(not(bootstrap))]
        cfg_select! { $($arms)* }
    };
}
42
43cfg_select_dispatch! {
44    any(target_arch = "x86", target_arch = "x86_64") => {
45        fn analyze_source_file_dispatch(
46            src: &str,
47            lines: &mut Vec<RelativeBytePos>,
48            multi_byte_chars: &mut Vec<MultiByteChar>,
49        ) {
50            if is_x86_feature_detected!("sse2") {
51                unsafe {
52                    analyze_source_file_sse2(src, lines, multi_byte_chars);
53                }
54            } else {
55                analyze_source_file_generic(
56                    src,
57                    src.len(),
58                    RelativeBytePos::from_u32(0),
59                    lines,
60                    multi_byte_chars,
61                );
62            }
63        }
64
65        /// Checks 16 byte chunks of text at a time. If the chunk contains
66        /// something other than printable ASCII characters and newlines, the
67        /// function falls back to the generic implementation. Otherwise it uses
68        /// SSE2 intrinsics to quickly find all newlines.
69        #[target_feature(enable = "sse2")]
70        unsafe fn analyze_source_file_sse2(
71            src: &str,
72            lines: &mut Vec<RelativeBytePos>,
73            multi_byte_chars: &mut Vec<MultiByteChar>,
74        ) {
75            #[cfg(target_arch = "x86")]
76            use std::arch::x86::*;
77            #[cfg(target_arch = "x86_64")]
78            use std::arch::x86_64::*;
79
80            const CHUNK_SIZE: usize = 16;
81
82            let (chunks, tail) = src.as_bytes().as_chunks::<CHUNK_SIZE>();
83
84            // This variable keeps track of where we should start decoding a
85            // chunk. If a multi-byte character spans across chunk boundaries,
86            // we need to skip that part in the next chunk because we already
87            // handled it.
88            let mut intra_chunk_offset = 0;
89
90            for (chunk_index, chunk) in chunks.iter().enumerate() {
91                // We don't know if the pointer is aligned to 16 bytes, so we
92                // use `loadu`, which supports unaligned loading.
93                let chunk = unsafe { _mm_loadu_si128(chunk.as_ptr() as *const __m128i) };
94
95                // For character in the chunk, see if its byte value is < 0, which
96                // indicates that it's part of a UTF-8 char.
97                let multibyte_test = _mm_cmplt_epi8(chunk, _mm_set1_epi8(0));
98                // Create a bit mask from the comparison results.
99                let multibyte_mask = _mm_movemask_epi8(multibyte_test);
100
101                // If the bit mask is all zero, we only have ASCII chars here:
102                if multibyte_mask == 0 {
103                    assert!(intra_chunk_offset == 0);
104
105                    // Check for newlines in the chunk
106                    let newlines_test = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8));
107                    let mut newlines_mask = _mm_movemask_epi8(newlines_test);
108
109                    let output_offset = RelativeBytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
110
111                    while newlines_mask != 0 {
112                        let index = newlines_mask.trailing_zeros();
113
114                        lines.push(RelativeBytePos(index) + output_offset);
115
116                        // Clear the bit, so we can find the next one.
117                        newlines_mask &= newlines_mask - 1;
118                    }
119                } else {
120                    // The slow path.
121                    // There are multibyte chars in here, fallback to generic decoding.
122                    let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
123                    intra_chunk_offset = analyze_source_file_generic(
124                        &src[scan_start..],
125                        CHUNK_SIZE - intra_chunk_offset,
126                        RelativeBytePos::from_usize(scan_start),
127                        lines,
128                        multi_byte_chars,
129                    );
130                }
131            }
132
133            // There might still be a tail left to analyze
134            let tail_start = src.len() - tail.len() + intra_chunk_offset;
135            if tail_start < src.len() {
136                analyze_source_file_generic(
137                    &src[tail_start..],
138                    src.len() - tail_start,
139                    RelativeBytePos::from_usize(tail_start),
140                    lines,
141                    multi_byte_chars,
142                );
143            }
144        }
145    }
146    _ => {
147        // The target (or compiler version) does not support SSE2 ...
148        fn analyze_source_file_dispatch(
149            src: &str,
150            lines: &mut Vec<RelativeBytePos>,
151            multi_byte_chars: &mut Vec<MultiByteChar>,
152        ) {
153            analyze_source_file_generic(
154                src,
155                src.len(),
156                RelativeBytePos::from_u32(0),
157                lines,
158                multi_byte_chars,
159            );
160        }
161    }
162}
163
164// `scan_len` determines the number of bytes in `src` to scan. Note that the
165// function can read past `scan_len` if a multi-byte character start within the
166// range but extends past it. The overflow is returned by the function.
167fn analyze_source_file_generic(
168    src: &str,
169    scan_len: usize,
170    output_offset: RelativeBytePos,
171    lines: &mut Vec<RelativeBytePos>,
172    multi_byte_chars: &mut Vec<MultiByteChar>,
173) -> usize {
174    assert!(src.len() >= scan_len);
175    let mut i = 0;
176    let src_bytes = src.as_bytes();
177
178    while i < scan_len {
179        let byte = unsafe {
180            // We verified that i < scan_len <= src.len()
181            *src_bytes.get_unchecked(i)
182        };
183
184        // How much to advance in order to get to the next UTF-8 char in the
185        // string.
186        let mut char_len = 1;
187
188        if byte == b'\n' {
189            let pos = RelativeBytePos::from_usize(i) + output_offset;
190            lines.push(pos + RelativeBytePos(1));
191        } else if byte >= 128 {
192            // This is the beginning of a multibyte char. Just decode to `char`.
193            let c = src[i..].chars().next().unwrap();
194            char_len = c.len_utf8();
195
196            let pos = RelativeBytePos::from_usize(i) + output_offset;
197            assert!((2..=4).contains(&char_len));
198            let mbc = MultiByteChar { pos, bytes: char_len as u8 };
199            multi_byte_chars.push(mbc);
200        }
201
202        i += char_len;
203    }
204
205    i - scan_len
206}