std\sys\args/windows.rs
1//! The Windows command line is just a string
2//! <https://docs.microsoft.com/en-us/archive/blogs/larryosterman/the-windows-command-line-is-just-a-string>
3//!
4//! This module implements the parsing necessary to turn that string into a list of arguments.
5
6#[cfg(test)]
7mod tests;
8
9pub use super::common::Args;
10use crate::ffi::{OsStr, OsString};
11use crate::num::NonZero;
12use crate::os::windows::prelude::*;
13use crate::path::{Path, PathBuf};
14use crate::sys::pal::os::current_exe;
15use crate::sys::pal::{ensure_no_nuls, fill_utf16_buf};
16use crate::sys::path::get_long_path;
17use crate::sys::{c, to_u16s};
18use crate::sys_common::AsInner;
19use crate::sys_common::wstr::WStrUnits;
20use crate::{io, iter, ptr};
21
22pub fn args() -> Args {
23 // SAFETY: `GetCommandLineW` returns a pointer to a null terminated UTF-16
24 // string so it's safe for `WStrUnits` to use.
25 unsafe {
26 let lp_cmd_line = c::GetCommandLineW();
27 let parsed_args_list = parse_lp_cmd_line(WStrUnits::new(lp_cmd_line), || {
28 current_exe().map(PathBuf::into_os_string).unwrap_or_else(|_| OsString::new())
29 });
30
31 Args::new(parsed_args_list)
32 }
33}
34
35/// Implements the Windows command-line argument parsing algorithm.
36///
37/// Microsoft's documentation for the Windows CLI argument format can be found at
38/// <https://docs.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-160#parsing-c-command-line-arguments>
39///
40/// A more in-depth explanation is here:
41/// <https://daviddeley.com/autohotkey/parameters/parameters.htm#WIN>
42///
43/// Windows includes a function to do command line parsing in shell32.dll.
44/// However, this is not used for two reasons:
45///
46/// 1. Linking with that DLL causes the process to be registered as a GUI application.
47/// GUI applications add a bunch of overhead, even if no windows are drawn. See
48/// <https://randomascii.wordpress.com/2018/12/03/a-not-called-function-can-cause-a-5x-slowdown/>.
49///
50/// 2. It does not follow the modern C/C++ argv rules outlined in the first two links above.
51///
52/// This function was tested for equivalence to the C/C++ parsing rules using an
53/// extensive test suite available at
54/// <https://github.com/ChrisDenton/winarg/tree/std>.
55fn parse_lp_cmd_line<'a, F: Fn() -> OsString>(
56 lp_cmd_line: Option<WStrUnits<'a>>,
57 exe_name: F,
58) -> Vec<OsString> {
59 const BACKSLASH: NonZero<u16> = NonZero::new(b'\\' as u16).unwrap();
60 const QUOTE: NonZero<u16> = NonZero::new(b'"' as u16).unwrap();
61 const TAB: NonZero<u16> = NonZero::new(b'\t' as u16).unwrap();
62 const SPACE: NonZero<u16> = NonZero::new(b' ' as u16).unwrap();
63
64 let mut ret_val = Vec::new();
65 // If the cmd line pointer is null or it points to an empty string then
66 // return the name of the executable as argv[0].
67 if lp_cmd_line.as_ref().and_then(|cmd| cmd.peek()).is_none() {
68 ret_val.push(exe_name());
69 return ret_val;
70 }
71 let mut code_units = lp_cmd_line.unwrap();
72
73 // The executable name at the beginning is special.
74 let mut in_quotes = false;
75 let mut cur = Vec::new();
76 for w in &mut code_units {
77 match w {
78 // A quote mark always toggles `in_quotes` no matter what because
79 // there are no escape characters when parsing the executable name.
80 QUOTE => in_quotes = !in_quotes,
81 // If not `in_quotes` then whitespace ends argv[0].
82 SPACE | TAB if !in_quotes => break,
83 // In all other cases the code unit is taken literally.
84 _ => cur.push(w.get()),
85 }
86 }
87 // Skip whitespace.
88 code_units.advance_while(|w| w == SPACE || w == TAB);
89 ret_val.push(OsString::from_wide(&cur));
90
91 // Parse the arguments according to these rules:
92 // * All code units are taken literally except space, tab, quote and backslash.
93 // * When not `in_quotes`, space and tab separate arguments. Consecutive spaces and tabs are
94 // treated as a single separator.
95 // * A space or tab `in_quotes` is taken literally.
96 // * A quote toggles `in_quotes` mode unless it's escaped. An escaped quote is taken literally.
97 // * A quote can be escaped if preceded by an odd number of backslashes.
98 // * If any number of backslashes is immediately followed by a quote then the number of
99 // backslashes is halved (rounding down).
100 // * Backslashes not followed by a quote are all taken literally.
101 // * If `in_quotes` then a quote can also be escaped using another quote
102 // (i.e. two consecutive quotes become one literal quote).
103 let mut cur = Vec::new();
104 let mut in_quotes = false;
105 while let Some(w) = code_units.next() {
106 match w {
107 // If not `in_quotes`, a space or tab ends the argument.
108 SPACE | TAB if !in_quotes => {
109 ret_val.push(OsString::from_wide(&cur[..]));
110 cur.truncate(0);
111
112 // Skip whitespace.
113 code_units.advance_while(|w| w == SPACE || w == TAB);
114 }
115 // Backslashes can escape quotes or backslashes but only if consecutive backslashes are followed by a quote.
116 BACKSLASH => {
117 let backslash_count = code_units.advance_while(|w| w == BACKSLASH) + 1;
118 if code_units.peek() == Some(QUOTE) {
119 cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count / 2));
120 // The quote is escaped if there are an odd number of backslashes.
121 if backslash_count % 2 == 1 {
122 code_units.next();
123 cur.push(QUOTE.get());
124 }
125 } else {
126 // If there is no quote on the end then there is no escaping.
127 cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count));
128 }
129 }
130 // If `in_quotes` and not backslash escaped (see above) then a quote either
131 // unsets `in_quote` or is escaped by another quote.
132 QUOTE if in_quotes => match code_units.peek() {
133 // Two consecutive quotes when `in_quotes` produces one literal quote.
134 Some(QUOTE) => {
135 cur.push(QUOTE.get());
136 code_units.next();
137 }
138 // Otherwise set `in_quotes`.
139 Some(_) => in_quotes = false,
140 // The end of the command line.
141 // Push `cur` even if empty, which we do by breaking while `in_quotes` is still set.
142 None => break,
143 },
144 // If not `in_quotes` and not BACKSLASH escaped (see above) then a quote sets `in_quote`.
145 QUOTE => in_quotes = true,
146 // Everything else is always taken literally.
147 _ => cur.push(w.get()),
148 }
149 }
150 // Push the final argument, if any.
151 if !cur.is_empty() || in_quotes {
152 ret_val.push(OsString::from_wide(&cur[..]));
153 }
154 ret_val
155}
156
/// A single argument for a command line under construction, tagged with how
/// it should be written out.
#[derive(Debug)]
pub(crate) enum Arg {
    /// An ordinary argument; quotes are added around it when needed.
    Regular(OsString),
    /// A pre-formatted argument; it is appended as a raw string without quoting.
    Raw(OsString),
}
164
/// Quoting policy applied to an argument when it is appended to a command line.
enum Quote {
    /// Every argument is wrapped in quotes.
    Always,
    /// Only empty arguments and arguments containing whitespace are wrapped in quotes.
    Auto,
    /// The argument is appended without any changes (#29494).
    Never,
}
173
174pub(crate) fn append_arg(cmd: &mut Vec<u16>, arg: &Arg, force_quotes: bool) -> io::Result<()> {
175 let (arg, quote) = match arg {
176 Arg::Regular(arg) => (arg, if force_quotes { Quote::Always } else { Quote::Auto }),
177 Arg::Raw(arg) => (arg, Quote::Never),
178 };
179
180 // If an argument has 0 characters then we need to quote it to ensure
181 // that it actually gets passed through on the command line or otherwise
182 // it will be dropped entirely when parsed on the other end.
183 ensure_no_nuls(arg)?;
184 let arg_bytes = arg.as_encoded_bytes();
185 let (quote, escape) = match quote {
186 Quote::Always => (true, true),
187 Quote::Auto => {
188 (arg_bytes.iter().any(|c| *c == b' ' || *c == b'\t') || arg_bytes.is_empty(), true)
189 }
190 Quote::Never => (false, false),
191 };
192 if quote {
193 cmd.push('"' as u16);
194 }
195
196 let mut backslashes: usize = 0;
197 for x in arg.encode_wide() {
198 if escape {
199 if x == '\\' as u16 {
200 backslashes += 1;
201 } else {
202 if x == '"' as u16 {
203 // Add n+1 backslashes to total 2n+1 before internal '"'.
204 cmd.extend((0..=backslashes).map(|_| '\\' as u16));
205 }
206 backslashes = 0;
207 }
208 }
209 cmd.push(x);
210 }
211
212 if quote {
213 // Add n backslashes to total 2n before ending '"'.
214 cmd.extend((0..backslashes).map(|_| '\\' as u16));
215 cmd.push('"' as u16);
216 }
217 Ok(())
218}
219
220fn append_bat_arg(cmd: &mut Vec<u16>, arg: &OsStr, mut quote: bool) -> io::Result<()> {
221 ensure_no_nuls(arg)?;
222 // If an argument has 0 characters then we need to quote it to ensure
223 // that it actually gets passed through on the command line or otherwise
224 // it will be dropped entirely when parsed on the other end.
225 //
226 // We also need to quote the argument if it ends with `\` to guard against
227 // bat usage such as `"%~2"` (i.e. force quote arguments) otherwise a
228 // trailing slash will escape the closing quote.
229 if arg.is_empty() || arg.as_encoded_bytes().last() == Some(&b'\\') {
230 quote = true;
231 }
232 for cp in arg.as_inner().inner.code_points() {
233 if let Some(cp) = cp.to_char() {
234 // Rather than trying to find every ascii symbol that must be quoted,
235 // we assume that all ascii symbols must be quoted unless they're known to be good.
236 // We also quote Unicode control blocks for good measure.
237 // Note an unquoted `\` is fine so long as the argument isn't otherwise quoted.
238 static UNQUOTED: &str = r"#$*+-./:?@\_";
239 let ascii_needs_quotes =
240 cp.is_ascii() && !(cp.is_ascii_alphanumeric() || UNQUOTED.contains(cp));
241 if ascii_needs_quotes || cp.is_control() {
242 quote = true;
243 }
244 }
245 }
246
247 if quote {
248 cmd.push('"' as u16);
249 }
250 // Loop through the string, escaping `\` only if followed by `"`.
251 // And escaping `"` by doubling them.
252 let mut backslashes: usize = 0;
253 for x in arg.encode_wide() {
254 if x == '\\' as u16 {
255 backslashes += 1;
256 } else {
257 if x == '"' as u16 {
258 // Add n backslashes to total 2n before internal `"`.
259 cmd.extend((0..backslashes).map(|_| '\\' as u16));
260 // Appending an additional double-quote acts as an escape.
261 cmd.push(b'"' as u16)
262 } else if x == '%' as u16 || x == '\r' as u16 {
263 // yt-dlp hack: replaces `%` with `%%cd:~,%` to stop %VAR% being expanded as an environment variable.
264 //
265 // # Explanation
266 //
267 // cmd supports extracting a substring from a variable using the following syntax:
268 // %variable:~start_index,end_index%
269 //
270 // In the above command `cd` is used as the variable and the start_index and end_index are left blank.
271 // `cd` is a built-in variable that dynamically expands to the current directory so it's always available.
272 // Explicitly omitting both the start and end index creates a zero-length substring.
273 //
274 // Therefore it all resolves to nothing. However, by doing this no-op we distract cmd.exe
275 // from potentially expanding %variables% in the argument.
276 cmd.extend_from_slice(&[
277 '%' as u16, '%' as u16, 'c' as u16, 'd' as u16, ':' as u16, '~' as u16,
278 ',' as u16,
279 ]);
280 }
281 backslashes = 0;
282 }
283 cmd.push(x);
284 }
285 if quote {
286 // Add n backslashes to total 2n before ending `"`.
287 cmd.extend((0..backslashes).map(|_| '\\' as u16));
288 cmd.push('"' as u16);
289 }
290 Ok(())
291}
292
293pub(crate) fn make_bat_command_line(
294 script: &[u16],
295 args: &[Arg],
296 force_quotes: bool,
297) -> io::Result<Vec<u16>> {
298 const INVALID_ARGUMENT_ERROR: io::Error =
299 io::const_error!(io::ErrorKind::InvalidInput, r#"batch file arguments are invalid"#);
300 // Set the start of the command line to `cmd.exe /c "`
301 // It is necessary to surround the command in an extra pair of quotes,
302 // hence the trailing quote here. It will be closed after all arguments
303 // have been added.
304 // Using /e:ON enables "command extensions" which is essential for the `%` hack to work.
305 let mut cmd: Vec<u16> = "cmd.exe /e:ON /v:OFF /d /c \"".encode_utf16().collect();
306
307 // Push the script name surrounded by its quote pair.
308 cmd.push(b'"' as u16);
309 // Windows file names cannot contain a `"` character or end with `\\`.
310 // If the script name does then return an error.
311 if script.contains(&(b'"' as u16)) || script.last() == Some(&(b'\\' as u16)) {
312 return Err(io::const_error!(
313 io::ErrorKind::InvalidInput,
314 "Windows file names may not contain `\"` or end with `\\`"
315 ));
316 }
317 cmd.extend_from_slice(script.strip_suffix(&[0]).unwrap_or(script));
318 cmd.push(b'"' as u16);
319
320 // Append the arguments.
321 // FIXME: This needs tests to ensure that the arguments are properly
322 // reconstructed by the batch script by default.
323 for arg in args {
324 cmd.push(' ' as u16);
325 match arg {
326 Arg::Regular(arg_os) => {
327 let arg_bytes = arg_os.as_encoded_bytes();
328 // Disallow \r and \n as they may truncate the arguments.
329 const DISALLOWED: &[u8] = b"\r\n";
330 if arg_bytes.iter().any(|c| DISALLOWED.contains(c)) {
331 return Err(INVALID_ARGUMENT_ERROR);
332 }
333 append_bat_arg(&mut cmd, arg_os, force_quotes)?;
334 }
335 _ => {
336 // Raw arguments are passed on as-is.
337 // It's the user's responsibility to properly handle arguments in this case.
338 append_arg(&mut cmd, arg, force_quotes)?;
339 }
340 };
341 }
342
343 // Close the quote we left opened earlier.
344 cmd.push(b'"' as u16);
345
346 Ok(cmd)
347}
348
349/// Takes a path and tries to return a non-verbatim path.
350///
351/// This is necessary because cmd.exe does not support verbatim paths.
352pub(crate) fn to_user_path(path: &Path) -> io::Result<Vec<u16>> {
353 from_wide_to_user_path(to_u16s(path)?)
354}
355pub(crate) fn from_wide_to_user_path(mut path: Vec<u16>) -> io::Result<Vec<u16>> {
356 // UTF-16 encoded code points, used in parsing and building UTF-16 paths.
357 // All of these are in the ASCII range so they can be cast directly to `u16`.
358 const SEP: u16 = b'\\' as _;
359 const QUERY: u16 = b'?' as _;
360 const COLON: u16 = b':' as _;
361 const U: u16 = b'U' as _;
362 const N: u16 = b'N' as _;
363 const C: u16 = b'C' as _;
364
365 // Early return if the path is too long to remove the verbatim prefix.
366 const LEGACY_MAX_PATH: usize = 260;
367 if path.len() > LEGACY_MAX_PATH {
368 return Ok(path);
369 }
370
371 match &path[..] {
372 // `\\?\C:\...` => `C:\...`
373 [SEP, SEP, QUERY, SEP, _, COLON, SEP, ..] => unsafe {
374 let lpfilename = path[4..].as_ptr();
375 fill_utf16_buf(
376 |buffer, size| c::GetFullPathNameW(lpfilename, size, buffer, ptr::null_mut()),
377 |full_path: &[u16]| {
378 if full_path == &path[4..path.len() - 1] {
379 let mut path: Vec<u16> = full_path.into();
380 path.push(0);
381 path
382 } else {
383 path
384 }
385 },
386 )
387 },
388 // `\\?\UNC\...` => `\\...`
389 [SEP, SEP, QUERY, SEP, U, N, C, SEP, ..] => unsafe {
390 // Change the `C` in `UNC\` to `\` so we can get a slice that starts with `\\`.
391 path[6] = b'\\' as u16;
392 let lpfilename = path[6..].as_ptr();
393 fill_utf16_buf(
394 |buffer, size| c::GetFullPathNameW(lpfilename, size, buffer, ptr::null_mut()),
395 |full_path: &[u16]| {
396 if full_path == &path[6..path.len() - 1] {
397 let mut path: Vec<u16> = full_path.into();
398 path.push(0);
399 path
400 } else {
401 // Restore the 'C' in "UNC".
402 path[6] = b'C' as u16;
403 path
404 }
405 },
406 )
407 },
408 // For everything else, leave the path unchanged.
409 _ => get_long_path(path, false),
410 }
411}