Coverage for aisdb/webdata/marinetraffic.py: 100%
34 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-30 04:14 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-30 04:14 +0000
1''' scrape vessel information such as deadweight tonnage from marinetraffic.com '''
3import os
5import numpy as np
6from selenium.webdriver.firefox.webdriver import WebDriver
7from selenium.webdriver.support.ui import WebDriverWait
8from selenium.webdriver.common.by import By
9from selenium.common.exceptions import TimeoutException
11from aisdb import sqlpath
12from aisdb.webdata._scraper import _scraper
13from aisdb.database.dbconn import PostgresDBConn, SQLiteDBConn
14import sqlite3
16baseurl = 'https://www.marinetraffic.com/'
18_err404 = 'INSERT INTO webdata_marinetraffic(mmsi, error404) '
19_err404 += 'VALUES (CAST(? as INT), 1)'
21_createtable_sqlfile = os.path.join(sqlpath,
22 'createtable_webdata_marinetraffic.sql')
23with open(_createtable_sqlfile, 'r') as f:
24 _createtable_sql = f.read()
26_insert_sqlfile = os.path.join(sqlpath, 'insert_webdata_marinetraffic.sql')
27with open(_insert_sqlfile, 'r') as f:
28 _insert_sql = f.read()
31def _nullinfo(track):
32 return {
33 'mmsi':
34 track['mmsi'],
35 'imo':
36 track['imo'] if 'imo' in track.keys() else 0,
37 'name': (track['vessel_name'] if 'vessel_name' in track.keys()
38 and track['vessel_name'] is not None else ''),
39 'vesseltype_generic':
40 None,
41 'vesseltype_detailed':
42 None,
43 'callsign':
44 None,
45 'flag':
46 None,
47 'gross_tonnage':
48 None,
49 'summer_dwt':
50 None,
51 'length_breadth':
52 None,
53 'year_built':
54 None,
55 'home_port':
56 None,
57 'error404':
58 1
59 }
62def _loaded(drv: WebDriver) -> bool: # pragma: no cover
63 asset_type = 'asset_type' in drv.current_url
64 e404 = '404' == drv.title[0:3]
65 exists = drv.find_elements(
66 by='id',
67 value='vesselDetails_voyageInfoSection',
68 )
69 return (exists or e404 or asset_type)
72def _updateinfo(info: str, vessel: dict) -> None: # pragma: no cover
73 i = info.split(': ')
74 if len(i) < 2:
75 return
76 vessel[i[0]] = i[1]
79def _getrow(vessel: dict) -> tuple: # pragma: no cover
80 if 'MMSI' not in vessel.keys() or vessel['MMSI'] == '-':
81 vessel['MMSI'] = 0
82 if 'IMO' not in vessel.keys() or vessel['IMO'] == '-':
83 vessel['IMO'] = 0
84 if 'Name' not in vessel.keys():
85 vessel['Name'] = ''
86 if 'Call Sign' not in vessel.keys():
87 vessel['Call Sign'] = ''
88 if 'Gross Tonnage' not in vessel.keys() or vessel['Gross Tonnage'] == '-':
89 vessel['Gross Tonnage'] = 0
90 elif ('Gross Tonnage' in vessel.keys()
91 and isinstance(vessel['Gross Tonnage'], str)):
92 vessel['Gross Tonnage'] = int(vessel['Gross Tonnage'].split()[0])
93 if 'Summer DWT' not in vessel.keys() or vessel['Summer DWT'] == '-':
94 vessel['Summer DWT'] = 0
95 elif ('Summer DWT' in vessel.keys()
96 and isinstance(vessel['Summer DWT'], str)):
97 vessel['Summer DWT'] = int(vessel['Summer DWT'].split()[0])
98 if 'Year Built' not in vessel.keys() or vessel['Year Built'] == '-':
99 vessel['Year Built'] = 0
100 return (
101 int(vessel['MMSI']),
102 int(vessel['IMO']),
103 vessel['Name'],
104 vessel['Vessel Type - Generic'],
105 vessel['Vessel Type - Detailed'],
106 vessel['Call Sign'],
107 vessel['Flag'],
108 int(vessel['Gross Tonnage']),
109 int(vessel['Summer DWT']),
110 vessel['Length Overall x Breadth Extreme'],
111 int(vessel['Year Built']),
112 vessel['Home Port'],
113 )
116def _insertvesselrow(elem, mmsi, trafficDB): # pragma: no cover
117 vessel = {}
118 for info in elem.text.split('\n'):
119 _updateinfo(info, vessel)
120 if len(vessel.keys()) < 11:
121 return
122 insertrow = _getrow(vessel)
124 print(insertrow)
126 with trafficDB as conn:
127 conn.execute(_insert_sql, insertrow).fetchall()
130def _vessel_info_dict(dbconn) -> dict:
131 if isinstance(dbconn, SQLiteDBConn):
132 # raise ValueError(f"Invalid database connection type: {dbconn}")
133 if not hasattr(dbconn, 'trafficdb'):
134 raise ValueError(
135 'Database connection does not have an attached traffic database!'
136 )
137 cur = dbconn.cursor()
138 cur.execute(
139 'SELECT * FROM sqlite_master '
140 'WHERE type="table" AND name LIKE "ais\\_%\\_dynamic" ESCAPE "\\" '
141 )
142 alias = dbconn._get_dbname(dbconn.trafficdb) + '.'
143 elif isinstance(dbconn, sqlite3.Connection):
144 alias = ''
145 elif isinstance(dbconn, SQLiteDBConn):
146 alias = ''
147 else:
148 raise ValueError(f"Invalid connection type: {dbconn}")
149 cur = dbconn.cursor()
150 res = cur.execute(
151 f'SELECT * FROM {alias}webdata_marinetraffic WHERE error404 != 1'
152 ).fetchall()
153 info_dict = {r['mmsi']: r for r in res}
154 return info_dict
157def vessel_info(tracks: iter, dbconn: sqlite3.Connection):
158 ''' append metadata scraped from marinetraffic.com to track dictionaries.
160 See :meth:`aisdb.database.dbqry.DBQuery.check_marinetraffic` for a
161 high-level method to retrieve metadata for all vessels observed within
162 a specific query time and region, or alternatively see
163 :meth:`aisdb.webdata.marinetraffic.VesselInfo.vessel_info_callback`
164 for scraping metadata for a given list of MMSIs
166 args:
167 tracks (iter)
168 collection of track dictionaries
169 dbconn (ConnectionType)
170 Either a :class:`aisdb.database.dbconn.SQLiteDBConn` or
171 :class:`aisdb.database.dbconn.PostgresDBConn` database
172 connection objects
173 '''
174 if not isinstance(dbconn, sqlite3.Connection):
175 raise ValueError(
176 f"Invalid database connection type: {dbconn}. Requires: {sqlite3.Connection}"
177 )
178 meta = _vessel_info_dict(dbconn)
179 for track in tracks:
180 assert isinstance(track, dict)
181 track['static'] = set(track['static']).union({'marinetraffic_info'})
182 if track['mmsi'] in meta.keys():
183 track['marinetraffic_info'] = meta[track['mmsi']]
184 else:
185 track['marinetraffic_info'] = _nullinfo(track)
186 yield track
189class VesselInfo(): # pragma: no cover
190 ''' scrape vessel metadata from marinetraffic.com
192 args:
193 trafficDBpath (string)
194 path where vessel traffic metadata should be stored
196 See :meth:`aisdb.database.dbqry.DBQuery.check_marinetraffic` for a
197 high-level method to retrieve metadata for all vessels observed within
198 a specific query time and region, or alternatively see
199 :meth:`aisdb.webdata.marinetraffic.VesselInfo.vessel_info_callback`
200 for scraping metadata for a given list of MMSIs
201 '''
203 def __init__(self, trafficDBpath, verbose=False):
204 self.driver = None
205 wd = os.path.dirname(trafficDBpath)
206 if not os.path.isdir(wd): # pragma: no cover
207 if verbose: print(f'creating directory: {wd}')
208 os.makedirs(wd)
209 self.trafficDB = sqlite3.Connection(trafficDBpath)
210 self.trafficDB.row_factory = sqlite3.Row
212 # create a new info table if it doesnt exist yet
213 with self.trafficDB as conn:
214 conn.execute(_createtable_sql)
216 def __enter__(self):
217 return self
219 def __exit__(self, exc_type, exc_value, tb):
220 if self.driver is not None:
221 self.driver.close()
222 self.driver.quit()
224 def _getinfo(self, *, url, searchmmsi, infotxt=''):
225 if self.driver is None:
226 self.driver = _scraper()
228 print(infotxt + url, end='\t')
229 try:
230 self.driver.get(url)
231 WebDriverWait(self.driver, 15).until(_loaded)
232 except TimeoutException:
233 print(f'timed out, skipping {searchmmsi=}')
235 # if timeout occurs, mark as error 404
236 with self.trafficDB as conn:
237 conn.execute(_err404, (str(searchmmsi), ))
238 return
239 except Exception as err:
240 self.driver.close()
241 self.driver.quit()
242 raise err
244 # recurse through vessel listings if multiple vessels appear
245 if 'asset_type' in self.driver.current_url:
246 print('recursing...')
248 urls = []
249 for elem in self.driver.find_elements(
250 By.CLASS_NAME,
251 value='ag-cell-content-link',
252 ):
253 urls.append(elem.get_attribute('href'))
255 for url in urls:
256 self._getinfo(url=url, searchmmsi=searchmmsi)
258 with self.trafficDB as conn:
259 insert404 = conn.execute(
260 'SELECT COUNT(*) FROM webdata_marinetraffic WHERE mmsi=?',
261 (str(searchmmsi), )).fetchone()[0] == 0
262 if insert404:
263 conn.execute(_err404, (str(searchmmsi), ))
265 elif 'hc-en' in self.driver.current_url:
266 raise RuntimeError('bad url??')
268 elif self.driver.title[0:3] == '404':
269 print(f'404 error! {searchmmsi=}')
270 with self.trafficDB as conn:
271 conn.execute(_err404, (str(searchmmsi), ))
273 value = 'vesselDetails_vesselInfoSection'
274 for elem in self.driver.find_elements(value=value):
275 _ = _insertvesselrow(elem, searchmmsi, self.trafficDB)
277 def vessel_info_callback(self, mmsis, retry_404=False, infotxt=''):
278 ''' search for metadata for given mmsis
280 args:
281 mmsis (list)
282 list of MMSI identifiers (integers)
283 '''
284 # only check unique mmsis
285 mmsis = np.unique(mmsis).astype(int)
286 print('.', end='') # second dot (first in dbqry.py)
288 # check existing
289 sqlcount = 'SELECT mmsi FROM webdata_marinetraffic \t'
290 sqlcount += f'WHERE mmsi IN ({",".join(["?" for _ in mmsis])})\n'
291 if retry_404:
292 sqlcount += 'AND error404 != 1 \n'
293 sqlcount += 'ORDER BY mmsi'
294 with self.trafficDB as conn:
295 existing = conn.execute(sqlcount, tuple(map(str,
296 mmsis))).fetchall()
297 print('.') # third dot
299 # skip existing mmsis
300 ex_mmsis = np.array(existing).flatten()
301 xor_mmsis = np.setdiff1d(mmsis, ex_mmsis, assume_unique=True)
302 if xor_mmsis.size == 0:
303 return
305 for mmsi in xor_mmsis:
306 if not 200000000 <= mmsi <= 780000000:
307 continue
308 url = f'{baseurl}en/ais/details/ships/mmsi:{mmsi}'
309 self._getinfo(url=url, searchmmsi=mmsi, infotxt=infotxt)
311 return
314'''
315# validate IMO
316if searchimo != 0:
317 checksum = str(
318 np.sum(
319 np.array(list(map(int, list(str(searchimo)[:-1])))) *
320 np.array([7, 6, 5, 4, 3, 2])))[-1]
321else:
322 checksum = '0'
323'''