I am attempting to learn regex builder. I scape data from a web site, pull out just the table rows <tr> and table data <td> and place them into a string. I have attempted to extract table data with regex builder with no success. For testing I placed 3 scrapped table rows into a multi line string and apply my regex pattern in a for loop. It does not appear to find any matches. I am clearly overlooking something. Below is my code:
func GetHTMLTableData() {
let stringData = """
<tr class=BdT Bdc($seperatorColor) Ta(end) Fz(s) Whs(nw)><td class=Py(10px) Ta(start) Pend(10px)><span>Jun 30, 2023</span></td><td class=Py(10px) Pstart(10px)><span>405.40</span></td><td class=Py(10px) Pstart(10px)><span>408.22</span></td><td class=Py(10px) Pstart(10px)><span>405.29</span></td><td class=Py(10px) Pstart(10px)><span>407.28</span></td><td class=Py(10px) Pstart(10px)><span>407.28</span></td><td class=Py(10px) Pstart(10px)><span>5,160,100</span></td></tr>
<tr class=BdT Bdc($seperatorColor) Ta(end) Fz(s) Whs(nw)><td class=Py(10px) Ta(start) Pend(10px)><span>Jun 29, 2023</span></td><td class=Py(10px) Pstart(10px)><span>400.60</span></td><td class=Py(10px) Pstart(10px)><span>402.67</span></td><td class=Py(10px) Pstart(10px)><span>400.19</span></td><td class=Py(10px) Pstart(10px)><span>402.51</span></td><td class=Py(10px) Pstart(10px)><span>402.51</span></td><td class=Py(10px) Pstart(10px)><span>3,914,800</span></td></tr>
<tr class=BdT Bdc($seperatorColor) Ta(end) Fz(s) Whs(nw)><td class=Py(10px) Ta(start) Pend(10px)><span>Jun 28, 2023</span></td><td class=Py(10px) Pstart(10px)><span>401.35</span></td><td class=Py(10px) Pstart(10px)><span>403.49</span></td><td class=Py(10px) Pstart(10px)><span>400.71</span></td><td class=Py(10px) Pstart(10px)><span>402.55</span></td><td class=Py(10px) Pstart(10px)><span>400.97</span></td><td class=Py(10px) Pstart(10px)><span>4,320,700</span></td></tr>
"""
let tradingDayPattern = Regex {
"<tr class=BdT Bdc($seperatorColor) Ta(end) Fz(s) Whs(nw)>"
"<td class=Py(10px) Ta(start) Pend(10px)><span>"
Capture(.date(format: "\(month: .abbreviated) \(day: .twoDigits), \(year: .extended(minimumLength: 4))", locale: Locale(identifier: "en_US_POSIX") , timeZone: .gmt))
"</span></td><td class=Py(10px) Pstart(10px)><span>"
TryCapture {
OneOrMore(.digit)
"."
Repeat(.digit, count: 2)
} transform: {
Double($0)
}
"</span></td><td class=Py(10px) Pstart(10px)><span>"
TryCapture {
OneOrMore(.digit)
"."
Repeat(.digit, count: 2)
} transform: {
Double($0)
}
"</span></td><td class=Py(10px) Pstart(10px)><span>"
TryCapture {
OneOrMore(.digit)
"."
Repeat(.digit, count: 2)
} transform: {
Double($0)
}
"</span></td><td class=Py(10px) Pstart(10px)><span>"
TryCapture {
OneOrMore(.digit)
"."
Repeat(.digit, count: 2)
} transform: {
Double($0)
}
"</span></td><td class=Py(10px) Pstart(10px)><span>"
TryCapture {
OneOrMore(.digit)
"."
Repeat(.digit, count: 2)
} transform: {
Double($0)
}
"</span></td><td class=Py(10px) Pstart(10px)><span>"
TryCapture {
OneOrMore(.digit)
","
Repeat(.digit, count: 3)
","
Repeat(.digit, count: 3)
} transform: {
Int($0)
}
"</span></td></tr>"
}
for match in stringData.matches(of: tradingDayPattern) {
let (line, date, open, high, low, close, adjClose, volume ) = match.output
print("\(date) - \(close)")
}
}
After some additional research and testing I discovered that the error in my RegEx Builder was related to attempting to capture the last number in the pattern (x,***,***). So I broke it out into 3 captures and then perform some math in the for loop. There is probably a better way but I have not yet figured it out. In addition, thanks to esilks input, I removed the line feeds from the multi line variable and converted it to a regular string. Clearly a better way. Below is the updated function.
func GetHTMLTableData3() {
let stringData = "<tr class=BdT Bdc($seperatorColor) Ta(end) Fz(s) Whs(nw)><td class=Py(10px) Ta(start) Pend(10px)><span>Jun 30, 2023</span></td><td class=Py(10px) Pstart(10px)><span>405.40</span></td><td class=Py(10px) Pstart(10px)><span>408.22</span></td><td class=Py(10px) Pstart(10px)><span>405.29</span></td><td class=Py(10px) Pstart(10px)><span>407.28</span></td><td class=Py(10px) Pstart(10px)><span>407.28</span></td><td class=Py(10px) Pstart(10px)><span>5,160,100</span></td></tr><tr class=BdT Bdc($seperatorColor) Ta(end) Fz(s) Whs(nw)><td class=Py(10px) Ta(start) Pend(10px)><span>Jun 29, 2023</span></td><td class=Py(10px) Pstart(10px)><span>400.60</span></td><td class=Py(10px) Pstart(10px)><span>402.67</span></td><td class=Py(10px) Pstart(10px)><span>400.19</span></td><td class=Py(10px) Pstart(10px)><span>402.51</span></td><td class=Py(10px) Pstart(10px)><span>402.51</span></td><td class=Py(10px) Pstart(10px)><span>3,914,800</span></td></tr><tr class=BdT Bdc($seperatorColor) Ta(end) Fz(s) Whs(nw)><td class=Py(10px) Ta(start) Pend(10px)><span>Jun 28, 2023</span></td><td class=Py(10px) Pstart(10px)><span>401.35</span></td><td class=Py(10px) Pstart(10px)><span>403.49</span></td><td class=Py(10px) Pstart(10px)><span>400.71</span></td><td class=Py(10px) Pstart(10px)><span>402.55</span></td><td class=Py(10px) Pstart(10px)><span>400.97</span></td><td class=Py(10px) Pstart(10px)><span>4,320,700</span></td></tr>"
let tradingDayPattern = Regex {
"<td class=Py(10px) Ta(start) Pend(10px)><span>"
Capture(.date(format: "\(month: .abbreviated) \(day: .twoDigits), \(year: .extended(minimumLength: 4))", locale: Locale(identifier: "en_US_POSIX") , timeZone: .gmt))
"</span></td><td class=Py(10px) Pstart(10px)><span>"
TryCapture {
OneOrMore(.digit)
"."
Repeat(.digit, count: 2)
} transform: {
Double($0)
}
"</span></td><td class=Py(10px) Pstart(10px)><span>"
TryCapture {
OneOrMore(.digit)
"."
Repeat(.digit, count: 2)
} transform: {
Double($0)
}
"</span></td><td class=Py(10px) Pstart(10px)><span>"
TryCapture {
OneOrMore(.digit)
"."
Repeat(.digit, count: 2)
} transform: {
Double($0)
}
"</span></td><td class=Py(10px) Pstart(10px)><span>"
TryCapture {
OneOrMore(.digit)
"."
Repeat(.digit, count: 2)
} transform: {
Double($0)
}
"</span></td><td class=Py(10px) Pstart(10px)><span>"
TryCapture {
OneOrMore(.digit)
"."
Repeat(.digit, count: 2)
} transform: {
Double($0)
}
"</span></td><td class=Py(10px) Pstart(10px)><span>"
Capture {
Repeat(.digit, count: 1)
}transform: {
Int($0)
}
","
Capture {
Repeat(.digit, count: 3)
}transform: {
Int($0)
}
","
Capture {
Repeat(.digit, count: 3)
} transform: {
Int($0)
}
"</span>"
}
for match in stringData.matches(of: tradingDayPattern) {
let (line, date, open, high, low, close, adjClose, v1, v2, v3) = match.output
let vol1 = v1! * 1000000
let vol2 = v2! * 1000
let volume = vol1 + vol2 + v3!
print("\(date) - \(close) - \(volume)")
}
}