1. The first step is to make your own copy of the spreadsheet [Make your copy of the Scraper SEO spreadsheet]
2. Open the “script” tab of your Sheets spreadsheet, then go to Extensions > Apps Script
3. The Apps Script page will open; check whether this script is already pasted into your spreadsheet.
This spreadsheet has a BUG when you copy it, so COPY and PASTE the script again inside the “Code.gs” file. Then save!
Here is the code for you to copy and paste:
/**
 * Simple Google Sheets SEO scraper to extract data from a URL.
 *
 * @param {"https://keywordsinsheets.com"} url - input the target URL.
 * @param {false} allHeadings [OPTIONAL] Select whether you would like the first heading returned, or all headings returned in a comma-separated list. Default is set to false.
 * @param {false} displayLinks [OPTIONAL] Select whether you would like the internal and external links returned as comma-separated lists (truncated to the 50,000-character cell limit). Default is set to false.
 * @param {"body"} selector [OPTIONAL] Include a CSS selector to scrape custom content.
 * @customfunction
 */
function seoscraper(url, allHeadings, displayLinks, selector) {
  // Validate the URL
  if (!url) {
    return "You need to enter a valid URL.";
  } else if (!url.includes("http")) {
    return "You need to include the URL protocol, e.g. http or https.";
  }
  // Default values for allHeadings and displayLinks
  allHeadings = allHeadings || false;
  displayLinks = displayLinks || false;
  // Data array to store the extracted attributes
  const data = [];
  try {
    // Fetch the URL content
    const fetch = UrlFetchApp.fetch(url, {
      muteHttpExceptions: true,
      followRedirects: false
    });
    // Load the fetched content into Cheerio
    // (requires the Cheerio library to be added to the Apps Script project)
    const content = fetch.getContentText();
    const $ = Cheerio.load(content);
    // Get the body text
    const body = $("body").text();
    // Function to format text
    const trimText = (text) => text.trim().toLowerCase();
    // Get the response status code
    const status = fetch.getResponseCode();
    // Extract various SEO attributes
    const title = $("title").text().trim();
    const description = $("meta[name='description']").attr("content");
    const canonical = $("link[rel='canonical']").attr("href");
    const robots = $("meta[name='robots']").attr("content");
    const wordCount = body.trim().split(/\s+/).length;
    const h1 = trimText($("h1").first().text());
    const h2 = allHeadings ? $("h2").map((i, e) => trimText($(e).text())).get().toString() : trimText($("h2").first().text());
    const h3 = allHeadings ? $("h3").map((i, e) => trimText($(e).text())).get().toString() : trimText($("h3").first().text());
    // Extract content from the provided CSS selector
    const customContent = $(selector).text();
    // Function to truncate the link lists to the Google Sheets cell limit
    function truncateLinks(links) {
      let result = '';
      const limit = 50000; // Google Sheets 50,000-character cell limit
      const separator = ', ';
      for (let i = 0; i < links.length; i++) {
        let temp = result + links[i] + separator;
        if (temp.length > limit) {
          break;
        }
        result = temp;
      }
      return result;
    }
    // Function to extract domain from a URL
    function getDomain(url) {
      var domain;
      // If URL contains "://", split by "/" and get the third element
      if (url.indexOf("://") > -1) {
        domain = url.split('/')[2];
      } else {
        // Else, split by "/" and get the first element
        domain = url.split('/')[0];
      }
      // Split domain by ":" and get the first element to drop any port
      domain = domain.split(':')[0];
      return domain;
    }
    // Get base domain of the input URL
    const baseDomain = getDomain(url);
    // Variables to store internal and external links count
    let internalLinksCount = 0;
    let externalLinksCount = 0;
    // Sets to store unique internal and external links
    let internalLinks = new Set();
    let externalLinks = new Set();
    // Iterate over all anchor tags
    $('a').each((i, link) => {
      const href = $(link).attr('href');
      if (href) {
        const linkDomain = getDomain(href);
        // If link domain matches base domain or href starts with '/', it's an internal link
        if (linkDomain === baseDomain || href.startsWith('/')) {
          internalLinks.add(href.startsWith('/') ? `${url}${href}` : href);
          internalLinksCount++;
        } else if (href.startsWith('http')) {
          // If href starts with 'http', it's an external link
          externalLinks.add(href);
          externalLinksCount++;
        }
      }
    });
    // Convert Sets to Arrays for further processing
    internalLinks = Array.from(internalLinks);
    externalLinks = Array.from(externalLinks);
    // Truncate the link lists to fit within the 50,000-character cell limit
    internalLinks = truncateLinks(internalLinks);
    externalLinks = truncateLinks(externalLinks);
    // Indexability checks
    let indexability = "indexable";
    let indexabilityStatus = "";
    if (status >= 300 && status < 400) {
      indexability = "non-indexable";
      indexabilityStatus = "redirect";
    } else if (status >= 400 && status < 500) {
      indexability = "non-indexable";
      indexabilityStatus = "client error";
    } else if (status >= 500) {
      indexability = "non-indexable";
      indexabilityStatus = "server error";
    }
    if (robots && (robots.toLowerCase().includes("noindex") || robots.toLowerCase().includes("none"))) {
      indexability = "non-indexable";
      indexabilityStatus = "noindex directive";
    }
    if (canonical && canonical !== url) {
      indexability = "non-indexable";
      indexabilityStatus = "canonicalisation";
    }
    // Array to store all schema types
    let schemaTypes = [];
    // Function to extract the top-level @type property from an object
    function extractTypes(data) {
      if (typeof data === 'object' && data !== null) {
        if (data["@type"]) {
          schemaTypes.push(data["@type"]);
        }
      }
    }
    // Find all script tags with type application/ld+json
    $('script[type="application/ld+json"]').each((i, elem) => {
      try {
        // Parse the JSON content
        let structuredData = JSON.parse($(elem).html());
        // If structuredData is an array, extract @type from each item
        if (Array.isArray(structuredData)) {
          structuredData.forEach(item => extractTypes(item));
        } else {
          // If it's an object, extract @type directly
          extractTypes(structuredData);
        }
      } catch (error) {
        // Ignore any errors (e.g., if the JSON parsing fails)
        Logger.log("Error parsing JSON: " + error);
      }
    });
    // If status is 200, push all attributes into the data array
    if (status === 200) {
      const resultData = [status, indexability, indexabilityStatus, title, description, h1, h2, h3, robots, canonical, wordCount, internalLinksCount, externalLinksCount];
      // If displayLinks is set to true, include the internal and external links
      if (displayLinks) {
        resultData.push(internalLinks, externalLinks);
      }
      // Push schema types and custom content last
      resultData.push(schemaTypes.toString(), customContent);
      data.push(resultData);
    } else {
      // If not, only push the status
      data.push([status, indexability, indexabilityStatus]);
    }
    // Return the data array
    return data;
  } catch (err) {
    // If any error occurs during the scraping process, return an error message
    return "Cannot scrape URL";
  }
}
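If you want to sanity-check the function before using it in a cell, you can run a small test from the Apps Script editor. This helper is not part of the original script, and the test URL is only an example:

// Hypothetical helper (not part of the original script): select it in the
// Apps Script editor, click Run, and check the output in the execution log.
// The URL below is just an example.
function testSeoscraper() {
  const result = seoscraper("https://keywordsinsheets.com", true, false);
  Logger.log(result);
}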
4. To use your spreadsheet, just put the URL in column A and enter the scraper formula below:
=seoscraper(A1,TRUE,TRUE) — enter this formula in the “STATUS code” column (in a Portuguese-locale spreadsheet, use VERDADEIRO instead of TRUE).
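The two TRUE arguments turn on allHeadings and displayLinks. The function also accepts fewer arguments, or a CSS selector as the fourth argument to scrape custom content. The variants below are only illustrative; the ".post-content" selector is a placeholder, so use a selector that actually exists on your pages:

=seoscraper(A1)
=seoscraper(A1,TRUE)
=seoscraper(A1,TRUE,TRUE,".post-content")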
Watch the video below to see how to use your Scraper SEO spreadsheet.