1. The first step is to make your own copy of the spreadsheet [Make your copy of the Scraper SEO spreadsheet]
2. Open the “script” tab of your Sheets spreadsheet, then go to Extensions > Apps Script
3. The Apps Script page will open; check whether this script is already pasted into your spreadsheet.
This spreadsheet has a BUG when you copy it, so COPY and PASTE the script again inside the “Code.gs” file. Then save!
Here is the code for you to copy and paste:
/**
 * Simple Google Sheets SEO scraper to extract data from a URL.
 *
 * @param {"https://keywordsinsheets.com"} url - input the target URL.
 * @param {false} allHeadings [OPTIONAL] Select whether you would like the first heading returned, or all headings returned in a comma-separated list. Default is set to false.
 * @param {false} displayLinks [OPTIONAL] Select whether you would like the internal and external links returned as comma-separated lists (truncated to the 50,000-character cell limit). Default is set to false.
 * @param {"body"} selector [OPTIONAL] Include a CSS selector to scrape custom content.
 * @customfunction
 */
function seoscraper(url, allHeadings, displayLinks, selector) {
  // Validate the URL
  if (!url) {
    return "You need to enter a valid URL.";
  } else if (!url.includes("http")) {
    return "You need to include the URL protocol, e.g. http or https.";
  }
  // Default values for allHeadings and displayLinks
  allHeadings = allHeadings || false;
  displayLinks = displayLinks || false;
  // Data array to store the extracted attributes
  const data = [];
  try {
    // Fetch the URL content
    const fetch = UrlFetchApp.fetch(url, {
      muteHttpExceptions: true,
      followRedirects: false
    });
    // Load the fetched content into Cheerio
    // (requires the Cheerio library to be added to the Apps Script project)
    const content = fetch.getContentText();
    const $ = Cheerio.load(content);
    // Get the body text
    const body = $("body").text();
    // Function to format text
    const trimText = (text) => text.trim().toLowerCase();
    // Get the response status code
    const status = fetch.getResponseCode();
    // Extract various SEO attributes
    const title = $("title").text().trim();
    const description = $("meta[name='description']").attr("content");
    const canonical = $("link[rel='canonical']").attr("href");
    const robots = $("meta[name='robots']").attr("content");
    const wordCount = body.trim().split(/\s+/).length;
    const h1 = trimText($("h1").first().text());
    const h2 = allHeadings ? $("h2").map((i, e) => trimText($(e).text())).get().toString() : trimText($("h2").first().text());
    const h3 = allHeadings ? $("h3").map((i, e) => trimText($(e).text())).get().toString() : trimText($("h3").first().text());
    // Extract content from the provided CSS selector
    const customContent = $(selector).text();
    // Function to truncate the link lists to the Google Sheets cell limit
    function truncateLinks(links) {
      let result = '';
      const limit = 50000; // Google Sheets 50,000-character cell limit
      const separator = ', ';
      for (let i = 0; i < links.length; i++) {
        let temp = result + links[i] + separator;
        if (temp.length > limit) {
          break;
        }
        result = temp;
      }
      return result;
    }
    // Function to extract domain from a URL
    function getDomain(url) {
      var domain;
      // If URL contains "://", split by "/" and get the third element
      if (url.indexOf("://") > -1) {
        domain = url.split('/')[2];
      } else {
        // Else, split by "/" and get the first element
        domain = url.split('/')[0];
      }
      // Split domain by ":" and get the first element to drop any port
      domain = domain.split(':')[0];
      return domain;
    }
    // Get base domain of the input URL
    const baseDomain = getDomain(url);
    // Variables to store internal and external links count
    let internalLinksCount = 0;
    let externalLinksCount = 0;
    // Sets to store unique internal and external links
    let internalLinks = new Set();
    let externalLinks = new Set();
    // Iterate over all anchor tags
    $('a').each((i, link) => {
      const href = $(link).attr('href');
      if (href) {
        const linkDomain = getDomain(href);
        // If link domain matches base domain or href starts with '/', it's an internal link
        if (linkDomain === baseDomain || href.startsWith('/')) {
          internalLinks.add(href.startsWith('/') ? `${url}${href}` : href);
          internalLinksCount++;
        } else if (href.startsWith('http')) {
          // If href starts with 'http', it's an external link
          externalLinks.add(href);
          externalLinksCount++;
        }
      }
    });
    // Convert Sets to Arrays for further processing
    internalLinks = Array.from(internalLinks);
    externalLinks = Array.from(externalLinks);
    // Truncate the link lists to fit within the 50,000-character cell limit
    internalLinks = truncateLinks(internalLinks);
    externalLinks = truncateLinks(externalLinks);
    // Indexability checks
    let indexability = "indexable";
    let indexabilityStatus = "";
    if (status >= 300 && status < 400) {
      indexability = "non-indexable";
      indexabilityStatus = "redirect";
    } else if (status >= 400 && status < 500) {
      indexability = "non-indexable";
      indexabilityStatus = "client error";
    } else if (status >= 500) {
      indexability = "non-indexable";
      indexabilityStatus = "server error";
    }
    if (robots && (robots.toLowerCase().includes("noindex") || robots.toLowerCase().includes("none"))) {
      indexability = "non-indexable";
      indexabilityStatus = "noindex directive";
    }
    if (canonical && canonical !== url) {
      indexability = "non-indexable";
      indexabilityStatus = "canonicalisation";
    }
    // Array to store all schema types
    let schemaTypes = [];
    // Function to extract the top-level @type property from an object
    function extractTypes(data) {
      if (typeof data === 'object' && data !== null) {
        if (data["@type"]) {
          schemaTypes.push(data["@type"]);
        }
      }
    }
    // Find all script tags with type application/ld+json
    $('script[type="application/ld+json"]').each((i, elem) => {
      try {
        // Parse the JSON content
        let structuredData = JSON.parse($(elem).html());
        // If structuredData is an array, extract @type from each item
        if (Array.isArray(structuredData)) {
          structuredData.forEach(item => extractTypes(item));
        } else {
          // If it's an object, extract @type directly
          extractTypes(structuredData);
        }
      } catch (error) {
        // Ignore any errors (e.g., if the JSON parsing fails)
        Logger.log("Error parsing JSON: " + error);
      }
    });
    // If status is 200, push all attributes into the data array
    if (status === 200) {
      const resultData = [status, indexability, indexabilityStatus, title, description, h1, h2, h3, robots, canonical, wordCount, internalLinksCount, externalLinksCount];
      // If displayLinks is set to true, include the internal and external links
      if (displayLinks) {
        resultData.push(internalLinks, externalLinks);
      }
      // Push schema types and custom content last
      resultData.push(schemaTypes.toString(), customContent);
      data.push(resultData);
    } else {
      // If not, only push the status
      data.push([status, indexability, indexabilityStatus]);
    }
    // Return the data array
    return data;
  } catch (err) {
    // If any error occurs during the scraping process, return an error message
    return "Cannot scrape URL";
  }
}
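If you want to sanity-check the function before using it in a cell, you can run a small test from the Apps Script editor. This helper is not part of the original script, and the test URL is only an example:

// Hypothetical helper (not part of the original script): select it in the
// Apps Script editor, click Run, and check the output in the execution log.
// The URL below is just an example.
function testSeoscraper() {
  const result = seoscraper("https://keywordsinsheets.com", true, false);
  Logger.log(result);
}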
4. To use your spreadsheet, just put the URL in column A and enter the scraper formula below:
=seoscraper(A1,TRUE,TRUE) — enter this formula in the “STATUS code” column (in a Portuguese-locale spreadsheet, use VERDADEIRO instead of TRUE).
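The two TRUE arguments turn on allHeadings and displayLinks. The function also accepts fewer arguments, or a CSS selector as the fourth argument to scrape custom content. The variants below are only illustrative; the ".post-content" selector is a placeholder, so use a selector that actually exists on your pages:

=seoscraper(A1)
=seoscraper(A1,TRUE)
=seoscraper(A1,TRUE,TRUE,".post-content")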
Watch the video below to see how to use your Scraper SEO spreadsheet.