You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

216 lines
7.4 KiB

  1. // Copyright 2022 The Matrix.org Foundation C.I.C.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. use anyhow::bail;
  15. use anyhow::Context;
  16. use anyhow::Error;
  17. use lazy_static::lazy_static;
  18. use regex;
  19. use regex::Regex;
  20. use regex::RegexBuilder;
  21. lazy_static! {
  22. /// Matches runs of non-wildcard characters followed by wildcard characters.
  23. static ref WILDCARD_RUN: Regex = Regex::new(r"([^\?\*]*)([\?\*]*)").expect("valid regex");
  24. }
  25. /// Extract the localpart from a Matrix style ID
  26. pub(crate) fn get_localpart_from_id(id: &str) -> Result<&str, Error> {
  27. let (localpart, _) = id
  28. .split_once(':')
  29. .with_context(|| format!("ID does not contain colon: {id}"))?;
  30. // We need to strip off the first character, which is the ID type.
  31. if localpart.is_empty() {
  32. bail!("Invalid ID {id}");
  33. }
  34. Ok(&localpart[1..])
  35. }
  /// Tells `glob_to_regex` what the generated regex should be matched against.
  #[derive(Clone, Copy, Debug, Eq, PartialEq)]
  pub enum GlobMatchType {
      /// The regex must match the input in its entirety.
      Whole,
      /// The regex must match one or more words within the input.
      Word,
  }
  44. /// Convert a "glob" style expression to a regex, anchoring either to the entire
  45. /// input or to individual words.
  46. pub fn glob_to_regex(glob: &str, match_type: GlobMatchType) -> Result<Regex, Error> {
  47. let mut chunks = Vec::new();
  48. // Patterns with wildcards must be simplified to avoid performance cliffs
  49. // - The glob `?**?**?` is equivalent to the glob `???*`
  50. // - The glob `???*` is equivalent to the regex `.{3,}`
  51. for captures in WILDCARD_RUN.captures_iter(glob) {
  52. if let Some(chunk) = captures.get(1) {
  53. chunks.push(regex::escape(chunk.as_str()));
  54. }
  55. if let Some(wildcards) = captures.get(2) {
  56. if wildcards.as_str() == "" {
  57. continue;
  58. }
  59. let question_marks = wildcards.as_str().chars().filter(|c| *c == '?').count();
  60. if wildcards.as_str().contains('*') {
  61. chunks.push(format!(".{{{question_marks},}}"));
  62. } else {
  63. chunks.push(format!(".{{{question_marks}}}"));
  64. }
  65. }
  66. }
  67. let joined = chunks.join("");
  68. let regex_str = match match_type {
  69. GlobMatchType::Whole => format!(r"\A{joined}\z"),
  70. // `^|\W` and `\W|$` handle the case where `pattern` starts or ends with a non-word
  71. // character.
  72. GlobMatchType::Word => format!(r"(?:^|\b|\W){joined}(?:\b|\W|$)"),
  73. };
  74. Ok(RegexBuilder::new(&regex_str)
  75. .case_insensitive(true)
  76. .build()?)
  77. }
  78. /// Compiles the glob into a `Matcher`.
  79. pub fn get_glob_matcher(glob: &str, match_type: GlobMatchType) -> Result<Matcher, Error> {
  80. // There are a number of shortcuts we can make if the glob doesn't contain a
  81. // wild card.
  82. let matcher = if glob.contains(['*', '?']) {
  83. let regex = glob_to_regex(glob, match_type)?;
  84. Matcher::Regex(regex)
  85. } else if match_type == GlobMatchType::Whole {
  86. // If there aren't any wildcards and we're matching the whole thing,
  87. // then we simply can do a case-insensitive string match.
  88. Matcher::Whole(glob.to_lowercase())
  89. } else {
  90. // Otherwise, if we're matching against words then can first check
  91. // if the haystack contains the glob at all.
  92. Matcher::Word {
  93. word: glob.to_lowercase(),
  94. regex: None,
  95. }
  96. };
  97. Ok(matcher)
  98. }
  99. /// Matches against a glob
  100. pub enum Matcher {
  101. /// Plain regex matching.
  102. Regex(Regex),
  103. /// Case-insensitive equality.
  104. Whole(String),
  105. /// Word matching. `regex` is a cache of calling [`glob_to_regex`] on word.
  106. Word { word: String, regex: Option<Regex> },
  107. }
  108. impl Matcher {
  109. /// Checks if the glob matches the given haystack.
  110. pub fn is_match(&mut self, haystack: &str) -> Result<bool, Error> {
  111. // We want to to do case-insensitive matching, so we convert to
  112. // lowercase first.
  113. let haystack = haystack.to_lowercase();
  114. match self {
  115. Matcher::Regex(regex) => Ok(regex.is_match(&haystack)),
  116. Matcher::Whole(whole) => Ok(whole == &haystack),
  117. Matcher::Word { word, regex } => {
  118. // If we're looking for a literal word, then we first check if
  119. // the haystack contains the word as a substring.
  120. if !haystack.contains(&*word) {
  121. return Ok(false);
  122. }
  123. // If it does contain the word as a substring, then we need to
  124. // check if it is an actual word by testing it against the regex.
  125. let regex = if let Some(regex) = regex {
  126. regex
  127. } else {
  128. let compiled_regex = glob_to_regex(word, GlobMatchType::Word)?;
  129. regex.insert(compiled_regex)
  130. };
  131. Ok(regex.is_match(&haystack))
  132. }
  133. }
  134. }
  135. }
  /// Checks which IDs `get_localpart_from_id` rejects and what it extracts from
  /// the ones it accepts.
  #[test]
  fn test_get_domain_from_id() {
      // IDs with no colon, or with nothing before the first colon, are invalid.
      for malformed in ["", ":", ":asd", "::as::asad"] {
          assert!(
              get_localpart_from_id(malformed).is_err(),
              "expected {malformed:?} to be rejected"
          );
      }

      // Only the text between the leading ID type and the first colon is kept.
      let accepted = [
          ("@test:foo", "test"),
          ("@:", ""),
          ("@test:foo:907", "test"),
      ];
      for (id, localpart) in accepted {
          assert_eq!(get_localpart_from_id(id).unwrap(), localpart);
      }
  }
  /// Checks both the regex source generated by `glob_to_regex` and the matching
  /// behaviour of the compiled regex for each `GlobMatchType`.
  #[test]
  fn tset_glob() -> Result<(), Error> {
      // Glob -> expected regex source when anchored to the whole input.
      let expected_patterns = [
          ("simple", r"\Asimple\z"),
          ("simple*", r"\Asimple.{0,}\z"),
          ("simple?", r"\Asimple.{1}\z"),
          ("simple?*?*", r"\Asimple.{2,}\z"),
          ("simple???", r"\Asimple.{3}\z"),
          ("escape.", r"\Aescape\.\z"),
      ];
      for (glob, pattern) in expected_patterns {
          assert_eq!(
              glob_to_regex(glob, GlobMatchType::Whole)?.as_str(),
              pattern
          );
      }

      // (glob, match type, haystack, whether a match is expected)
      let match_cases = [
          ("simple", GlobMatchType::Whole, "simple", true),
          ("simple", GlobMatchType::Whole, "simples", false),
          ("simple*", GlobMatchType::Whole, "simples", true),
          ("simple?", GlobMatchType::Whole, "simples", true),
          ("simple*", GlobMatchType::Whole, "simple", true),
          ("simple", GlobMatchType::Word, "some simple.", true),
          ("simple", GlobMatchType::Word, "simple", true),
          ("simple", GlobMatchType::Word, "simples", false),
          ("@user:foo", GlobMatchType::Word, "Some @user:foo test", true),
          ("@user:foo", GlobMatchType::Word, "@user:foo", true),
      ];
      for (glob, match_type, haystack, expected) in match_cases {
          assert_eq!(
              glob_to_regex(glob, match_type)?.is_match(haystack),
              expected,
              "glob {glob:?} ({match_type:?}) against {haystack:?}"
          );
      }

      Ok(())
  }