page.js

  1. import { aggregatePagination, pagination, api, parseContent } from './util';
  2. import infoboxParser from 'infobox-parser';
  3. import { tokenize, constructTree } from 'hyntax';
  4. import { parseCoordinates } from './coordinates';
  5. import QueryChain from './chain';
  6. const get = (obj, first, ...rest) => {
  7. if (obj === undefined || first === undefined) return obj;
  8. if (typeof first === 'function') {
  9. return get(first(obj), ...rest);
  10. }
  11. return get(obj[first], ...rest);
  12. };
  13. const firstValue = obj => {
  14. if (typeof obj === 'object') return obj[Object.keys(obj)[0]];
  15. return obj[0];
  16. };
  17. const getFileName = text => {
  18. if (Array.isArray(text)) text = text[0];
  19. if (!text) return undefined;
  20. if (text.indexOf(':') !== -1) {
  21. const [, name] = text.split(':');
  22. return name;
  23. }
  24. return text;
  25. };
  26. /**
  27. * WikiPage
  28. * @namespace WikiPage
  29. */
  30. export default function wikiPage(rawPageInfo, apiOptions) {
  31. const raw = rawPageInfo;
  /**
   * HTML from page, read from the first revision returned by the API
   * (requested with `rvparse`)
   * @example
   * wiki.page('batman').then(page => page.html()).then(console.log);
   * @method WikiPage#html
   * @return {Promise}
   */
  function html() {
    const params = {
      prop: 'revisions',
      rvprop: 'content',
      rvlimit: 1,
      rvparse: '',
      titles: raw.title
    };
    return api(apiOptions, params).then(res => {
      const revision = res.query.pages[raw.pageid].revisions[0];
      return revision['*'];
    });
  }
  /**
   * @summary Page content as structured sections: the raw content run through `parseContent`
   * @alias sections
   * @example
   * wiki.page('batman').then(page => page.content()).then(console.log);
   * @method WikiPage#content
   * @return {Promise}
   */
  function content() {
    const pending = rawContent();
    return pending.then(text => parseContent(text));
  }
  /**
   * Raw content from page (the `extract` field of the content query result)
   * @example
   * wiki.page('batman').then(page => page.rawContent()).then(console.log);
   * @method WikiPage#rawContent
   * @return {Promise}
   */
  function rawContent() {
    const query = chain().content();
    return query.request().then(response => response.extract);
  }
  /**
   * Text summary from page (the `extract` field of the summary query result)
   * @example
   * wiki.page('batman').then(page => page.summary()).then(console.log);
   * @method WikiPage#summary
   * @return {Promise}
   */
  function summary() {
    const query = chain().summary();
    return query.request().then(response => response.extract);
  }
  /**
   * Main page image directly from API
   * @method WikiPage#pageImage
   * @returns {Promise} resolves to the `image.original.source` value of the
   * response, or `undefined` when any part of that path is missing
   */
  function pageImage() {
    const imageOptions = { original: true, name: true };
    const readSource = response => get(response, 'image', 'original', 'source');
    return chain()
      .image(imageOptions)
      .request()
      .then(readSource);
  }
  /**
   * Raw data from images from page
   * @example
   * wiki.page('batman').then(page => page.rawImages()).then(console.log);
   * @method WikiPage#rawImages
   * @return {Promise} resolves to the list of page objects from the response,
   * or an empty list when the response has no `query`
   */
  function rawImages() {
    const params = {
      generator: 'images',
      gimlimit: 'max',
      prop: 'imageinfo',
      iiprop: 'url',
      titles: raw.title
    };
    return api(apiOptions, params).then(res => {
      if (!res.query) return [];
      const { pages } = res.query;
      return Object.keys(pages).map(pageId => pages[pageId]);
    });
  }
  /**
   * Main image URL from infobox on page.
   * The page image reported by the API always wins; the infobox image (or,
   * when there is no infobox image, the image mentioned earliest in the
   * page's info text) is only used as a fallback.
   * @example
   * wiki.page('batman').then(page => page.mainImage()).then(console.log);
   * @method WikiPage#mainImage
   * @return {Promise} resolves to a URL, or `undefined` when none was found
   */
  function mainImage() {
    // URL of the first imageinfo entry of an image page, if it has one
    const firstUrl = image =>
      image && image.imageinfo && image.imageinfo.length > 0
        ? image.imageinfo[0].url
        : undefined;
    // If no page image could be found, fall back to the given candidate
    const withFallback = candidate =>
      pageImage().then(url => url || firstUrl(candidate));
    // Some wikis use underscores for spaces, some don't; case is ignored too
    const normalize = name => name.replace(/\s/g, '_').toUpperCase();

    return Promise.all([rawImages(), info()]).then(([images, metadata]) => {
      // Handle different translations of "image" here
      const mainImageName = getFileName(
        metadata.image ||
          metadata.bildname ||
          metadata.imagen ||
          metadata.Immagine ||
          metadata.badge ||
          metadata.logo
      );
      // Handle case where no info box exists
      if (!mainImageName) {
        return rawInfo().then(text => {
          if (!images.length) return undefined;
          const wikitext = text || '';
          // Rank images by where their title is first seen in the info text;
          // images that are never mentioned rank after every mentioned one
          const rank = image => {
            const index = wikitext.indexOf(image.title);
            return index === -1 ? Infinity : index;
          };
          let earliest = images[0];
          let earliestRank = rank(earliest);
          for (const image of images) {
            const imageRank = rank(image);
            if (imageRank < earliestRank) {
              earliest = image;
              earliestRank = imageRank;
            }
          }
          return withFallback(earliest);
        });
      }
      const wanted = normalize(mainImageName);
      const match = images.find(({ title }) => {
        const filename = getFileName(title);
        // Entries without a title cannot match anything
        return Boolean(filename) && normalize(filename) === wanted;
      });
      return withFallback(match);
    });
  }
  /**
   * Image URL's from page
   * @example
   * wiki.page('batman').then(page => page.images()).then(console.log);
   * @method WikiPage#images
   * @return {Promise} resolves to a flat list with the `url` of every
   * imageinfo entry of every image on the page
   */
  function images() {
    return rawImages().then(pages => {
      const urls = [];
      for (const page of pages) {
        // Entries without an `imageinfo` list contribute no URLs
        for (const entry of page.imageinfo || []) {
          urls.push(entry.url);
        }
      }
      return urls;
    });
  }
  /**
   * External links from page, fetched as the `extlinks` property of a page chain
   * @example
   * wiki().page('batman').then(page => page.externalLinks()).then(console.log);
   * // or
   * wiki().chain().search('batman').extlinks().request()
   * @method WikiPage#externalLinks
   * @return {Promise}
   */
  function externalLinks() {
    const query = chain();
    return query.direct('extlinks');
  }
  /**
   * Whether a parsed HTML node carries the given CSS class.
   * The class attribute is split on whitespace and compared token by token,
   * so `external` does not match `nonexternal`.
   * @param {Object} node - AST node with `content.attributes`
   * @param {String} className - class to look for
   * @return {Boolean}
   */
  function hasClass(node, className) {
    const attributes = node.content.attributes;
    if (!attributes) return false;
    return attributes.some(attr => {
      // Skip other attributes and a `class` attribute written without a value
      if (!attr.key || attr.key.content !== 'class' || !attr.value) {
        return false;
      }
      return attr.value.content.split(/\s+/).indexOf(className) !== -1;
    });
  }
  /**
   * Whether a parsed HTML node is a tag node (`nodeType` of `'tag'`)
   * @param {Object} node - AST node
   * @return {Boolean}
   */
  function isTag(node) {
    const kind = node.nodeType;
    return kind === 'tag';
  }
  /**
   * Whether a parsed HTML node has the given element name (`content.name`)
   * @param {Object} node - AST node
   * @param {String} name - element name to compare against
   * @return {Boolean}
   */
  function hasName(node, name) {
    const tagName = node.content.name;
    return tagName === name;
  }
  /**
   * Depth-first search for the first node that satisfies a predicate.
   * The node itself is tested before its children.
   * @param {Object} [node] - AST node to start from; a missing node yields `null`
   * @param {Function} predicate - called with each visited node
   * @return {Object|null} the first matching node, or `null`
   */
  function findNode(node, predicate) {
    // Callers may hand over a child slot that does not exist
    if (!node) return null;
    if (predicate(node)) return node;
    // search through children as well
    const children = (node.content && node.content.children) || [];
    for (const child of children) {
      const found = findNode(child, predicate);
      if (found) return found;
    }
    return null;
  }
  /**
   * Collect every node in a subtree that satisfies a predicate, in
   * depth-first order (a node is tested before its children).
   * @param {Object} [node] - AST node to start from; a missing node adds nothing
   * @param {Function} predicate - called with each visited node
   * @param {Object[]} [nodes] - list that matches are appended to
   * @return {Object[]} the `nodes` list
   */
  function findNodes(node, predicate, nodes = []) {
    if (!node) return nodes;
    if (predicate(node)) {
      nodes.push(node);
    }
    const children = (node.content && node.content.children) || [];
    for (const child of children) {
      findNodes(child, predicate, nodes);
    }
    return nodes;
  }
  /**
   * References from page.
   * Parses the page HTML and collects the `href` of every external link found
   * directly inside a `cite` element of a reference list item.
   * @example
   * wiki().page('batman').then(page => page.references()).then(console.log);
   * @method WikiPage#references
   * @return {Promise} resolves to a list of link targets
   */
  function references() {
    return html()
      .then(inputHTML => {
        const { tokens } = tokenize(inputHTML);
        const { ast } = constructTree(tokens);
        return ast;
      })
      .then(ast => {
        const links = [];
        const refs = [];
        // There can be multiple reference sections
        findNodes(
          ast,
          node =>
            isTag(node) && hasName(node, 'ol') && hasClass(node, 'references'),
          refs
        );
        for (const ref of refs) {
          // An empty list has no children to look through
          const items = (ref.content.children || []).filter(
            el => isTag(el) && hasName(el, 'li') && el.content.children
          );
          for (const item of items) {
            // The reference was moved under a span under li
            const span = item.content.children[2];
            // Items laid out differently have no such child; nothing to read
            if (!span) continue;
            const cite = findNode(
              span,
              node => isTag(node) && hasName(node, 'cite')
            );
            if (!cite || !cite.content.children) continue;
            for (const el of cite.content.children) {
              if (isTag(el) && hasName(el, 'a') && hasClass(el, 'external')) {
                const linkAttr = el.content.attributes.find(
                  attr => attr.key.content === 'href'
                );
                // Ignore anchors that carry no usable href
                if (linkAttr && linkAttr.value) {
                  links.push(linkAttr.value.content);
                }
              }
            }
          }
        }
        return links;
      });
  }
  /**
   * Paginated links from page (titles of links in namespace 0)
   * @example
   * wiki().page('batman').then(page => page.links()).then(console.log);
   * @method WikiPage#links
   * @param {Boolean} [aggregated] - return all links (default is true)
   * @param {Number} [limit] - number of links per page
   * @return {Promise} - returns results if aggregated [and next function for more results if not aggregated]
   */
  function links(aggregated = true, limit = 100) {
    const params = {
      prop: 'links',
      plnamespace: 0,
      pllimit: limit,
      titles: raw.title
    };
    const toTitles = res => {
      const pageLinks = res.query.pages[raw.pageid].links || [];
      return pageLinks.map(link => link.title);
    };
    const pages = pagination(apiOptions, params, toTitles);
    return aggregated ? aggregatePagination(pages) : pages;
  }
  /**
   * Paginated categories from page (category titles)
   * @example
   * wiki().page('batman').then(page => page.categories()).then(console.log);
   * @method WikiPage#categories
   * @param {Boolean} [aggregated] - return all categories (default is true)
   * @param {Number} [limit] - number of categories per page
   * @return {Promise} - returns results if aggregated [and next function for more results if not aggregated]
   */
  function categories(aggregated = true, limit = 100) {
    const params = chain()
      .categories(limit)
      .params();
    const toTitles = res => {
      const pageCategories = res.query.pages[raw.pageid].categories || [];
      return pageCategories.map(category => category.title);
    };
    const pages = pagination(apiOptions, params, toTitles);
    return aggregated ? aggregatePagination(pages) : pages;
  }
  /**
   * Geographical coordinates from page.
   * Uses the `coordinates` property of the page when it is available and
   * otherwise parses them out of the infobox data.
   * @example
   * wiki().page('Texas').then(texas => texas.coordinates())
   * @method WikiPage#coordinates
   * @return {Promise}
   */
  function coordinates() {
    // Deprecated source of coordinates: the infobox of the page
    const fromInfobox = () => info().then(data => parseCoordinates(data));
    return chain()
      .direct('coordinates')
      .then(coords => (coords ? coords : fromInfobox()));
  }
  /**
   * Revision content of section 0 of a page, as returned by the API
   * @param {String} [title] - page to read; defaults to this page's title
   * @return {Promise} resolves to the content, or `undefined` when the
   * response does not contain it
   */
  function rawInfo(title) {
    const params = {
      prop: 'revisions',
      rvprop: 'content',
      rvsection: 0,
      titles: title || raw.title
    };
    const readContent = res =>
      get(res, 'query', 'pages', firstValue, 'revisions', 0, '*');
    return api(apiOptions, params).then(readContent);
  }
  /**
   * Fetch the page's revision content and parse the tables within it
   * @method WikiPage#tables
   * @return {Promise} Resolves to a collection of tables
   */
  function tables() {
    const params = {
      prop: 'revisions',
      rvprop: 'content',
      titles: raw.title
    };
    const readContent = res =>
      get(res, 'query', 'pages', firstValue, 'revisions', 0, '*');
    const parseTables = wikitext =>
      infoboxParser(wikitext, apiOptions.parser).tables;
    return api(apiOptions, params)
      .then(readContent)
      .then(parseTables);
  }
  /**
   * Get general information from page, with optional specific property.
   * When the page itself yields no infobox data, the template page
   * `Template:Infobox <lowercased title>` is tried instead.
   * @deprecated This method will be dropped and replaced with the `fullInfo` implementation in v5
   * @example
   * wiki().page('Batman').then(page => page.info('alter_ego'));
   * @method WikiPage#info
   * @param {String} [key] - Information key. Falsy keys are ignored
   * @return {Promise} - info Object contains key/value pairs of infobox data, or specific value if key given
   * (`undefined` when the key is not present)
   */
  function info(key) {
    return rawInfo()
      .then(wikitext => {
        // Use general data for now...
        const general = infoboxParser(wikitext, apiOptions.parser).general;
        if (Object.keys(general).length > 0) {
          return general;
        }
        // If empty, check to see if this page has a templated infobox
        return rawInfo(`Template:Infobox ${raw.title.toLowerCase()}`).then(
          _wikitext =>
            infoboxParser(_wikitext || '', apiOptions.parser).general
        );
      })
      .then(metadata => {
        if (!key) {
          return metadata;
        }
        // Keys come from parsed wikitext, so never call a method looked up on
        // the data object itself
        if (Object.prototype.hasOwnProperty.call(metadata, key)) {
          return metadata[key];
        }
        return undefined;
      });
  }
  /**
   * Get the full infobox data: the complete parser result for the page's
   * info text rather than only its `general` part
   * @example
   * new Wiki().page('Batman').then(page => page.fullInfo()).then(info => info.general.aliases);
   * @method WikiPage#fullInfo
   * @return {Promise} - Parsed object of all infobox data
   */
  function fullInfo() {
    const parse = wikitext => infoboxParser(wikitext, apiOptions.parser);
    return rawInfo().then(parse);
  }
  /**
   * Paginated backlinks from page (titles of the pages linking here)
   * @method WikiPage#backlinks
   * @param {Boolean} [aggregated] - return all backlinks (default is true)
   * @param {Number} [limit] - number of backlinks per page
   * @return {Promise} - includes results [and next function for more results if not aggregated]
   */
  function backlinks(aggregated = true, limit = 100) {
    const params = {
      list: 'backlinks',
      bllimit: limit,
      bltitle: raw.title
    };
    const toTitles = res => {
      const found = res.query.backlinks || [];
      return found.map(link => link.title);
    };
    const pages = pagination(apiOptions, params, toTitles);
    return aggregated ? aggregatePagination(pages) : pages;
  }
  /**
   * Get list of links to different translations, fetched as the `langlinks`
   * property of a page chain
   * @method WikiPage#langlinks
   * @return {Promise} - includes link objects { lang, title, url }
   */
  function langlinks() {
    const query = chain();
    return query.direct('langlinks');
  }
  /**
   * Get URL for wiki page (the `canonicalurl` of the raw page info)
   * @method WikiPage#url
   * @return {String}
   */
  function url() {
    const address = raw.canonicalurl;
    return address;
  }
  452. const page = Object.assign({}, raw);
  /**
   * Returns a new QueryChain for the page, built from the API options and
   * the page id
   * @method WikiPage#chain
   * @returns {QueryChain}
   */
  function chain() {
    const pageId = raw.pageid;
    return new QueryChain(apiOptions, pageId);
  }
  461. Object.assign(page, {
  462. raw,
  463. html,
  464. rawContent,
  465. content,
  466. sections: content,
  467. summary,
  468. images,
  469. references,
  470. links,
  471. externalLinks,
  472. categories,
  473. coordinates,
  474. info,
  475. backlinks,
  476. rawImages,
  477. mainImage,
  478. langlinks,
  479. rawInfo,
  480. fullInfo,
  481. pageImage,
  482. tables,
  483. url,
  484. chain
  485. });
  486. return page;
  487. }