Coverage for aisdb/webdata/marinetraffic.py: 100%

34 statements  

coverage.py v7.3.1, created at 2023-09-30 04:14 +0000

''' scrape vessel information such as deadweight tonnage from marinetraffic.com '''

import os

import numpy as np
from selenium.webdriver.firefox.webdriver import WebDriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

from aisdb import sqlpath
from aisdb.webdata._scraper import _scraper
from aisdb.database.dbconn import PostgresDBConn, SQLiteDBConn
import sqlite3

baseurl = 'https://www.marinetraffic.com/'

_err404 = 'INSERT INTO webdata_marinetraffic(mmsi, error404) '
_err404 += 'VALUES (CAST(? as INT), 1)'

_createtable_sqlfile = os.path.join(sqlpath,
                                    'createtable_webdata_marinetraffic.sql')
with open(_createtable_sqlfile, 'r') as f:
    _createtable_sql = f.read()

_insert_sqlfile = os.path.join(sqlpath, 'insert_webdata_marinetraffic.sql')
with open(_insert_sqlfile, 'r') as f:
    _insert_sql = f.read()


def _nullinfo(track):
    return {
        'mmsi': track['mmsi'],
        'imo': track['imo'] if 'imo' in track.keys() else 0,
        'name': (track['vessel_name'] if 'vessel_name' in track.keys()
                 and track['vessel_name'] is not None else ''),
        'vesseltype_generic': None,
        'vesseltype_detailed': None,
        'callsign': None,
        'flag': None,
        'gross_tonnage': None,
        'summer_dwt': None,
        'length_breadth': None,
        'year_built': None,
        'home_port': None,
        'error404': 1
    }


def _loaded(drv: WebDriver) -> bool:  # pragma: no cover
    asset_type = 'asset_type' in drv.current_url
    e404 = '404' == drv.title[0:3]
    exists = drv.find_elements(
        by='id',
        value='vesselDetails_voyageInfoSection',
    )
    return (exists or e404 or asset_type)


def _updateinfo(info: str, vessel: dict) -> None:  # pragma: no cover
    i = info.split(': ')
    if len(i) < 2:
        return
    vessel[i[0]] = i[1]


def _getrow(vessel: dict) -> tuple:  # pragma: no cover
    if 'MMSI' not in vessel.keys() or vessel['MMSI'] == '-':
        vessel['MMSI'] = 0
    if 'IMO' not in vessel.keys() or vessel['IMO'] == '-':
        vessel['IMO'] = 0
    if 'Name' not in vessel.keys():
        vessel['Name'] = ''
    if 'Call Sign' not in vessel.keys():
        vessel['Call Sign'] = ''
    if 'Gross Tonnage' not in vessel.keys() or vessel['Gross Tonnage'] == '-':
        vessel['Gross Tonnage'] = 0
    elif ('Gross Tonnage' in vessel.keys()
          and isinstance(vessel['Gross Tonnage'], str)):
        vessel['Gross Tonnage'] = int(vessel['Gross Tonnage'].split()[0])
    if 'Summer DWT' not in vessel.keys() or vessel['Summer DWT'] == '-':
        vessel['Summer DWT'] = 0
    elif ('Summer DWT' in vessel.keys()
          and isinstance(vessel['Summer DWT'], str)):
        vessel['Summer DWT'] = int(vessel['Summer DWT'].split()[0])
    if 'Year Built' not in vessel.keys() or vessel['Year Built'] == '-':
        vessel['Year Built'] = 0
    return (
        int(vessel['MMSI']),
        int(vessel['IMO']),
        vessel['Name'],
        vessel['Vessel Type - Generic'],
        vessel['Vessel Type - Detailed'],
        vessel['Call Sign'],
        vessel['Flag'],
        int(vessel['Gross Tonnage']),
        int(vessel['Summer DWT']),
        vessel['Length Overall x Breadth Extreme'],
        int(vessel['Year Built']),
        vessel['Home Port'],
    )


def _insertvesselrow(elem, mmsi, trafficDB):  # pragma: no cover
    vessel = {}
    for info in elem.text.split('\n'):
        _updateinfo(info, vessel)
    if len(vessel.keys()) < 11:
        return
    insertrow = _getrow(vessel)

    print(insertrow)

    with trafficDB as conn:
        conn.execute(_insert_sql, insertrow).fetchall()


def _vessel_info_dict(dbconn) -> dict:
    if isinstance(dbconn, SQLiteDBConn):
        if not hasattr(dbconn, 'trafficdb'):
            raise ValueError(
                'Database connection does not have an attached traffic database!'
            )
        cur = dbconn.cursor()
        cur.execute(
            'SELECT * FROM sqlite_master '
            'WHERE type="table" AND name LIKE "ais\\_%\\_dynamic" ESCAPE "\\" '
        )
        alias = dbconn._get_dbname(dbconn.trafficdb) + '.'
    elif isinstance(dbconn, sqlite3.Connection):
        alias = ''
    else:
        raise ValueError(f"Invalid connection type: {dbconn}")
    cur = dbconn.cursor()
    res = cur.execute(
        f'SELECT * FROM {alias}webdata_marinetraffic WHERE error404 != 1'
    ).fetchall()
    info_dict = {r['mmsi']: r for r in res}
    return info_dict


def vessel_info(tracks: iter, dbconn: sqlite3.Connection):
    ''' append metadata scraped from marinetraffic.com to track dictionaries.

        See :meth:`aisdb.database.dbqry.DBQuery.check_marinetraffic` for a
        high-level method to retrieve metadata for all vessels observed within
        a specific query time and region, or alternatively see
        :meth:`aisdb.webdata.marinetraffic.VesselInfo.vessel_info_callback`
        for scraping metadata for a given list of MMSIs.

        args:
            tracks (iter)
                collection of track dictionaries
            dbconn (sqlite3.Connection)
                An :class:`aisdb.database.dbconn.SQLiteDBConn` database
                connection, or another :class:`sqlite3.Connection` that can
                reach the ``webdata_marinetraffic`` table
    '''
    if not isinstance(dbconn, sqlite3.Connection):
        raise ValueError(
            f"Invalid database connection type: {dbconn}. Requires: {sqlite3.Connection}"
        )
    meta = _vessel_info_dict(dbconn)
    for track in tracks:
        assert isinstance(track, dict)
        track['static'] = set(track['static']).union({'marinetraffic_info'})
        if track['mmsi'] in meta.keys():
            track['marinetraffic_info'] = meta[track['mmsi']]
        else:
            track['marinetraffic_info'] = _nullinfo(track)
        yield track
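
# Illustrative usage sketch (not part of the original module): attach
# marinetraffic metadata from an existing traffic database to a few track
# dictionaries. The database path and MMSI are placeholders; the
# webdata_marinetraffic table is created by the VesselInfo class defined below.
def _example_attach_info(trafficDBpath='./traffic.db'):
    VesselInfo(trafficDBpath)  # ensure the webdata_marinetraffic table exists
    conn = sqlite3.Connection(trafficDBpath)
    conn.row_factory = sqlite3.Row  # rows must be addressable by column name
    tracks = [{'mmsi': 316001234, 'static': set()}]
    for track in vessel_info(tracks, dbconn=conn):
        print(track['marinetraffic_info'])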


class VesselInfo():  # pragma: no cover
    ''' scrape vessel metadata from marinetraffic.com

        args:
            trafficDBpath (string)
                path where vessel traffic metadata should be stored

        See :meth:`aisdb.database.dbqry.DBQuery.check_marinetraffic` for a
        high-level method to retrieve metadata for all vessels observed within
        a specific query time and region, or alternatively see
        :meth:`aisdb.webdata.marinetraffic.VesselInfo.vessel_info_callback`
        for scraping metadata for a given list of MMSIs.
    '''

    def __init__(self, trafficDBpath, verbose=False):
        self.driver = None
        wd = os.path.dirname(trafficDBpath)
        if not os.path.isdir(wd):  # pragma: no cover
            if verbose:
                print(f'creating directory: {wd}')
            os.makedirs(wd)
        self.trafficDB = sqlite3.Connection(trafficDBpath)
        self.trafficDB.row_factory = sqlite3.Row

        # create a new info table if it doesn't exist yet
        with self.trafficDB as conn:
            conn.execute(_createtable_sql)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        if self.driver is not None:
            self.driver.close()
            self.driver.quit()

    def _getinfo(self, *, url, searchmmsi, infotxt=''):
        if self.driver is None:
            self.driver = _scraper()

        print(infotxt + url, end='\t')
        try:
            self.driver.get(url)
            WebDriverWait(self.driver, 15).until(_loaded)
        except TimeoutException:
            print(f'timed out, skipping {searchmmsi=}')

            # if timeout occurs, mark as error 404
            with self.trafficDB as conn:
                conn.execute(_err404, (str(searchmmsi), ))
            return
        except Exception as err:
            self.driver.close()
            self.driver.quit()
            raise err

        # recurse through vessel listings if multiple vessels appear
        if 'asset_type' in self.driver.current_url:
            print('recursing...')

            urls = []
            for elem in self.driver.find_elements(
                    By.CLASS_NAME,
                    value='ag-cell-content-link',
            ):
                urls.append(elem.get_attribute('href'))

            for url in urls:
                self._getinfo(url=url, searchmmsi=searchmmsi)

            with self.trafficDB as conn:
                insert404 = conn.execute(
                    'SELECT COUNT(*) FROM webdata_marinetraffic WHERE mmsi=?',
                    (str(searchmmsi), )).fetchone()[0] == 0
                if insert404:
                    conn.execute(_err404, (str(searchmmsi), ))

        elif 'hc-en' in self.driver.current_url:
            raise RuntimeError('bad url??')

        elif self.driver.title[0:3] == '404':
            print(f'404 error! {searchmmsi=}')
            with self.trafficDB as conn:
                conn.execute(_err404, (str(searchmmsi), ))

        value = 'vesselDetails_vesselInfoSection'
        for elem in self.driver.find_elements(value=value):
            _ = _insertvesselrow(elem, searchmmsi, self.trafficDB)

    def vessel_info_callback(self, mmsis, retry_404=False, infotxt=''):
        ''' search for metadata for given mmsis

            args:
                mmsis (list)
                    list of MMSI identifiers (integers)
                retry_404 (bool)
                    if True, re-scrape MMSIs previously marked as 404 errors
                infotxt (string)
                    prefix prepended to progress messages
        '''
        # only check unique mmsis
        mmsis = np.unique(mmsis).astype(int)
        print('.', end='')  # second dot (first in dbqry.py)

        # check existing
        sqlcount = 'SELECT mmsi FROM webdata_marinetraffic \t'
        sqlcount += f'WHERE mmsi IN ({",".join(["?" for _ in mmsis])})\n'
        if retry_404:
            sqlcount += 'AND error404 != 1 \n'
        sqlcount += 'ORDER BY mmsi'
        with self.trafficDB as conn:
            existing = conn.execute(sqlcount,
                                    tuple(map(str, mmsis))).fetchall()
        print('.')  # third dot

        # skip existing mmsis
        ex_mmsis = np.array(existing).flatten()
        xor_mmsis = np.setdiff1d(mmsis, ex_mmsis, assume_unique=True)
        if xor_mmsis.size == 0:
            return

        for mmsi in xor_mmsis:
            if not 200000000 <= mmsi <= 780000000:
                continue
            url = f'{baseurl}en/ais/details/ships/mmsi:{mmsi}'
            self._getinfo(url=url, searchmmsi=mmsi, infotxt=infotxt)

        return
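
# Illustrative usage sketch (not part of the original module): scrape metadata
# for a short list of MMSIs. The database path and MMSIs are placeholders, and
# a working selenium/Firefox setup is assumed (see aisdb.webdata._scraper).
def _example_scrape_mmsis(trafficDBpath='./traffic.db'):
    with VesselInfo(trafficDBpath) as vinfo:
        vinfo.vessel_info_callback(mmsis=[316001234, 316005678])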


'''
# validate IMO
if searchimo != 0:
    checksum = str(
        np.sum(
            np.array(list(map(int, list(str(searchimo)[:-1])))) *
            np.array([7, 6, 5, 4, 3, 2])))[-1]
else:
    checksum = '0'
'''
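
# A self-contained sketch of the IMO check-digit rule referenced by the
# commented-out fragment above: multiply the first six digits by weights 7
# through 2, sum the products, and compare the last digit of the sum against
# the seventh digit. The function name and example number are illustrative.
def _valid_imo(imo: int) -> bool:
    digits = [int(d) for d in str(imo).zfill(7)]
    checksum = sum(d * w for d, w in zip(digits[:6], (7, 6, 5, 4, 3, 2)))
    return checksum % 10 == digits[6]

# e.g. _valid_imo(9074729) is True: 9*7 + 0*6 + 7*5 + 4*4 + 7*3 + 2*2 = 139,
# which ends in 9, matching the final digit.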